Reference genome
Copy $ cd /home/USER/db/refanno
* Latest stable version available at the time of writing
https://www.gencodegenes.org/human/release_33.html ; Human v33 (GRCh38)
Copy # Genome sequence, primary assembly
$ wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_33/GRCh38.primary_assembly.genome.fa.gz
$ gunzip GRCh38.primary_assembly.genome.fa.gz
Copy # Comprehensive gene annotation
$ wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_33/gencode.v33.annotation.gtf.gz
$ gunzip gencode.v33.annotation.gtf.gz
Copy # Transcript sequences
$ wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_33/gencode.v33.transcripts.fa.gz
$ gunzip gencode.v33.transcripts.fa.gz
You can also retrieve mouse data from https://www.gencodegenes.org/mouse/ ; Mouse vM24 (GRCm38)
Copy $ sed 's/|ENSG.*//' gencode.v33.transcripts.fa > gencode.v33.transcripts.clean.fa
Copy $ grep ">" gencode.v33.transcripts.fa | head
> ENST00000456328.2 | ENSG00000223972.5 | OTTHUMG00000000961.2 | OTTHUMT00000362751.1 | DDX11L1-202 | DDX11L1 | 1657 | processed_transcript |
> ENST00000450305.2 | ENSG00000223972.5 | OTTHUMG00000000961.2 | OTTHUMT00000002844.2 | DDX11L1-201 | DDX11L1 | 632 | transcribed_unprocessed_pseudogene |
> ENST00000488147.1 | ENSG00000227232.5 | OTTHUMG00000000958.1 | OTTHUMT00000002839.1 | WASH7P-201 | WASH7P | 1351 | unprocessed_pseudogene |
> ENST00000619216.1 | ENSG00000278267.1 | - | - | MIR6859-1-201 | MIR6859-1 | 68 | miRNA |
> ENST00000473358.1 | ENSG00000243485.5 | OTTHUMG00000000959.2 | OTTHUMT00000002840.1 | MIR1302-2HG-202 | MIR1302-2HG | 712 | lncRNA |
> ENST00000469289.1 | ENSG00000243485.5 | OTTHUMG00000000959.2 | OTTHUMT00000002841.2 | MIR1302-2HG-201 | MIR1302-2HG | 535 | lncRNA |
> ENST00000607096.1 | ENSG00000284332.1 | - | - | MIR1302-2-201 | MIR1302-2 | 138 | miRNA |
> ENST00000417324.1 | ENSG00000237613.2 | OTTHUMG00000000960.1 | OTTHUMT00000002842.1 | FAM138A-201 | FAM138A | 1187 | lncRNA |
> ENST00000461467.1 | ENSG00000237613.2 | OTTHUMG00000000960.1 | OTTHUMT00000002843.1 | FAM138A-202 | FAM138A | 590 | lncRNA |
> ENST00000606857.1 | ENSG00000268020.3 | OTTHUMG00000185779.1 | OTTHUMT00000471235.1 | OR4G4P-201 | OR4G4P | 840 | unprocessed_pseudogene |
$ grep ">" gencode.v33.transcripts.clean.fa | head
> ENST00000456328.2
> ENST00000450305.2
> ENST00000488147.1
> ENST00000619216.1
> ENST00000473358.1
> ENST00000469289.1
> ENST00000607096.1
> ENST00000417324.1
> ENST00000461467.1
> ENST00000606857.1
Create tabular info from GTF
We will use awk
and sed
to convert the gene-level and transcript-level information stored in GTF file into tabular file for easy access
Copy cat gencode.v33.annotation.gtf | \
awk 'BEGIN{FS="\t"}{split($9,a,";"); if($3~"gene") print a[1]"\t"a[3]"\t"$1":"$4"-"$5"\t"a[2]"\t"$7}' | \
sed 's/gene_id "//' | \
sed 's/gene_name "//' | \
sed 's/gene_type "//' | \
sed 's/"//g' | \
awk 'BEGIN{FS="\t"}{split($3,a,"[:-]"); print $1"\t"$2"\t"a[1]"\t"a[2]"\t"a[3]"\t"$4"\t"$5"\t"a[3]-a[2];}' | \
sort -k3,3 -k4,4n -k5,5n | \
sed "1i\GeneID\tGeneSymbol\tChromosome\tStart\tEnd\tClass\tStrand\tLength" \
> gencode.v33.annotation_genes.txt
gencode.v33.annotation_genes.txt
Copy GeneID GeneSymbol Chromosome Start End Class Strand Length
ENSG00000223972.5 DDX11L1 chr1 11869 14409 transcribed_unprocessed_pseudogene + 2540
ENSG00000227232.5 WASH7P chr1 14404 29570 unprocessed_pseudogene - 15166
ENSG00000278267.1 MIR6859-1 chr1 17369 17436 miRNA - 67
ENSG00000243485.5 MIR1302-2HG chr1 29554 31109 lncRNA + 1555
ENSG00000284332.1 MIR1302-2 chr1 30366 30503 miRNA + 137
ENSG00000237613.2 FAM138A chr1 34554 36081 lncRNA - 1527
ENSG00000268020.3 OR4G4P chr1 52473 53312 unprocessed_pseudogene + 839
ENSG00000240361.2 OR4G11P chr1 57598 64116 transcribed_unprocessed_pseudogene + 6518
ENSG00000186092.6 OR4F5 chr1 65419 71585 protein_coding + 6166
Copy cat gencode.v33.annotation.gtf | \
awk 'BEGIN{FS="\t"}{split($9,a,";"); if($3~"transcript") print a[1]"\t"a[4]"\t"$1":"$4"-"$5"\t"a[2]"\t"a[6]"\t"a[5]"\t"$7}' | \
sed 's/gene_id "//' | \
sed 's/gene_name "//' | \
sed 's/transcript_id "//' | \
sed 's/transcript_name "//' | \
sed 's/transcript_type "//' | \
sed 's/"//g' | \
awk 'BEGIN{FS="\t"}{split($3,a,"[:-]"); print $1"\t"$2"\t"a[1]"\t"a[2]"\t"a[3]"\t"$4"\t"$5"\t"$6"\t"$7"\t"a[3]-a[2];}' | \
sort -k3,3 -k4,4n -k5,5n | \
sed "1i\GeneID\tGeneSymbol\tChromosome\tStart\tEnd\tTranscriptID\tTranscriptName\tClass\tStrand\tLength" \
> gencode.v33.annotation_transcripts.txt
gencode.v33.annotation_transcripts.txt
Copy GeneID GeneSymbol Chromosome Start End TranscriptID TranscriptName Class Strand Length
ENSG00000223972.5 DDX11L1 chr1 11869 14409 ENST00000456328.2 DDX11L1-202 processed_transcript + 2540
ENSG00000223972.5 DDX11L1 chr1 12010 13670 ENST00000450305.2 DDX11L1-201 transcribed_unprocessed_pseudogene + 1660
ENSG00000227232.5 WASH7P chr1 14404 29570 ENST00000488147.1 WASH7P-201 unprocessed_pseudogene - 15166
ENSG00000278267.1 MIR6859-1 chr1 17369 17436 ENST00000619216.1 MIR6859-1-201 miRNA - 67
ENSG00000243485.5 MIR1302-2HG chr1 29554 31097 ENST00000473358.1 MIR1302-2HG-202 lncRNA + 1543
ENSG00000243485.5 MIR1302-2HG chr1 30267 31109 ENST00000469289.1 MIR1302-2HG-201 lncRNA + 842
ENSG00000284332.1 MIR1302-2 chr1 30366 30503 ENST00000607096.1 MIR1302-2-201 miRNA + 137
ENSG00000237613.2 FAM138A chr1 34554 36081 ENST00000417324.1 FAM138A-201 lncRNA - 1527
ENSG00000237613.2 FAM138A chr1 35245 36073 ENST00000461467.1 FAM138A-202 lncRNA - 828
Obtaining softwares
Copy $ cd /home/USER/tools
You can place the executables or make symbolic links in a location recognisable by$PATH
or make changes to$PATH
to avoid needing to specify full paths when using the programs. All codes provided assume executables can be found in $PATH
.
* Latest stable version available at the time of writing
FastQC
https://www.bioinformatics.babraham.ac.uk/projects/fastqc ; v0.11.9
Download and unzip the file. Change fastqc
permission from 664 (-rw-rw-r--
) to 755 (-rwxr-xr-x
)
Copy $ wget https://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_v0.11.9.zip
$ unzip fastqc_v0.11.9.zip
$ cd FastQC
$ chmod 755 fastqc
Copy $ ./fastqc --help
FastQC - A high throughput sequence QC analysis tool
SYNOPSIS
fastqc seqfile1 seqfile2 .. seqfileN
fastqc [-o output dir] [--(no)extract] [-f fastq | bam | sam]
[-c contaminant file] seqfile1 .. seqfileN
DESCRIPTION
FastQC reads a set of sequence files and produces from each one a quality
control report consisting of a number of different modules, each one of
which will help to identify a different potential type of problem in your
data.
If no files to process are specified on the command line then the program
will start as an interactive graphical application. If files are provided
on the command line then the program will run with no user interaction
required. In this mode it is suitable for inclusion into a standardised
analysis pipeline.
BBTools
https://jgi.doe.gov/data-and-tools/bbtools ; v38.82
Copy $ wget -O BBMap_38.82.tar.gz 'https://downloads.sourceforge.net/project/bbmap/BBMap_38.82.tar.gz?r=https%3A%2F%2Fsourceforge.net%2Fprojects%2Fbbmap%2Ffiles%2FBBMap_38.82.tar.gz%2Fdownload&ts=1587572478'
$ tar xvf BBMap_38.82.tar.gz
$ cd bbmap
Copy $ ./bbduk.sh
Written by Brian Bushnell
Last modified March 24, 2020
Description: Compares reads to the kmers in a reference dataset, optionally
allowing an edit distance. Splits the reads into two outputs - those that
match the reference, and those that don 't. Can also trim (remove) the matching
parts of the reads rather than binning the reads.
Please read bbmap/docs/guides/BBDukGuide.txt for more information.
Usage: bbduk.sh in=<input file> out=<output file> ref=<contaminant files>
Input may be stdin or a fasta or fastq file, compressed or uncompressed.
If you pipe via stdin/stdout, please include the file type; e.g. for gzipped
fasta input, set in=stdin.fa.gz
Salmon
https://github.com/COMBINE-lab/salmon ; v1.2.0
Copy $ wget https://github.com/COMBINE-lab/salmon/releases/download/v1.2.1/salmon-1.2.1_linux_x86_64.tar.gz
$ tar xvf salmon-1.2.1_linux_x86_64.tar.gz
$ mv salmon-latest_linux_x86_64 salmon-1.2.1
$ cd salmon-1.2.1/bin
Copy $ ./salmon
salmon v1.2.1
Usage: salmon -h | --help or
salmon -v | --version or
salmon -c | --cite or
salmon [--no-version-check] < COMMAND > [-h | options]
Commands:
index Create a salmon index
quant Quantify a sample
alevin single cell analysis
swim Perform super-secret operation
quantmerge Merge multiple quantifications into a single file
kallisto
https://github.com/pachterlab/kallisto ; v0.46.2
Kallisto is phasing out HDF5 at the time of writing, binaries for this release (v0.46.2) are compiled with HDF5 built in
Copy $ wget https://github.com/pachterlab/kallisto/releases/download/v0.46.2/kallisto_linux-v0.46.2.tar.gz
$ tar zxf kallisto_linux-v0.46.2.tar.gz
$ mv kallisto kallisto-v0.46.2
$ cd kallisto-v0.46.2
Copy $ ./kallisto
kallisto 0.46.2
Usage: kallisto < CM D > [arguments] ..
Where < CM D > can be one of:
index Builds a kallisto index
quant Runs the quantification algorithm
bus Generate BUS files for single-cell data
pseudo Runs the pseudoalignment step
merge Merges several batch runs
h5dump Converts HDF5-formatted results to plaintext
inspect Inspects and gives information about an index
version Prints version information
cite Prints citation information
Running kallisto < CM D > without arguments prints usage information for < CM D >
STAR
https://github.com/alexdobin/STAR ; v2.7.3a
Download and unpack the file. Use make
to compile STAR
Copy $ wget -O STAR-2.7.3a.tar.gz https://github.com/alexdobin/STAR/archive/2.7.3a.tar.gz
$ tar xvf STAR-2.7.3a.tar.gz
$ cd STAR-2.7.3a/source
$ make STAR
Copy $ ./STAR
Usage: STAR [options]... --genomeDir /path/to/genome/index/ --readFilesIn R1.fq R2.fq
Spliced Transcripts Alignment to a Reference (c) Alexander Dobin, 2009-2019
For more details see:
< https://github.com/alexdobin/STAR >
< https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf >
To list all parameters, run STAR --help
RSEM
https://github.com/deweylab/RSEM ; v1.3.3
Copy $ wget -O RSEM-1.3.3.tar.gz https://github.com/deweylab/RSEM/archive/v1.3.3.tar.gz
$ tar zxf RSEM-1.3.3.tar.gz
$ cd RSEM-1.3.3
By default, RSEM executables are installed to /usr/local/bin
. You can change the installation location by setting DESTDIR
and/or prefix
variables. The RSEM executables will be installed to ${DESTDIR}${prefix}/bin
. The default values of DESTDIR
and prefix
are DESTDIR=
and prefix=/usr/local
.
Copy $ make
$ make install
Copy $ rsem-calculate-expression
Invalid number of arguments!
NAME
rsem-calculate-expression - Estimate gene and isoform expression from
RNA-Seq data.
SYNOPSIS
rsem-calculate-expression [options] upstream_read_file( s ) reference_name sample_name
rsem-calculate-expression [options] --paired-end upstream_read_file( s ) downstream_read_file( s ) reference_name sample_name
rsem-calculate-expression [options] --alignments [--paired-end] input reference_name sample_name
Preparing indices
Copy $ cd /home/USER/db/refanno
Salmon index
SAF genome index (recommended)
Full genome is used as decoy, largde index but least prone to spurious alignments
Indexing takes under 40 minutes running on 6 threads, and the index size is 17 GB
Copy $ grep "^>" GRCh38.primary_assembly.genome.fa | cut -d " " -f 1 > GRCh38.decoys.txt
$ sed -i 's/>//g' GRCh38.decoys.txt
$ cat gencode.v33.transcripts.fa GRCh38.primary_assembly.genome.fa | gzip > GRCh38.gentrome.fa.gz
$ salmon index -p 6 --gencode -t GRCh38.gentrome.fa.gz \
-d GRCh38.decoys.txt -i gencode.v33_decoys_salmon-1.2.0
Alternatively, cDNA-only index
No decoy used, small index but most prone to possible spurious alignments
Indexing takes under 3 minutes running on 6 threads, and the index size is 782 MB
Copy $ salmon index -p 6 --gencode -t gencode.v33.transcripts.fa \
-i gencode.v33_salmon-1.2.0
kallisto index
Use header-cleaned transcript file to create index
Indexing takes under 6 minutes running on a single thread, and the index size is 3.0G
Copy $ kallisto index -i gencode.v33_kallisto-0.46.2 gencode.v33.transcripts.clean.fa
STAR index
The --sjdbOverhang
option is set as 150 in this tutorial. This length should be equal to the ReadLength-1
of the fastq files. The STAR manual suggests the ideal value is max(ReadLength)-1
in case of reads of varying length. In most cases, the default value of 100 will work as well as the ideal value.
Indexing takes about 70 minutes running on 6 threads, and the index size is 28 GB
Copy $ mkdir gencode.v33_star-2.7.3a
$ STAR --runMode genomeGenerate --runThreadN 6 \
--genomeDir gencode.v33_star-2.7.3a \
--genomeFastaFiles GRCh38.primary_assembly.genome.fa \
--sjdbGTFfile gencode.v33.annotation.gtf --sjdbOverhang 150
RSEM index
Indexing takes under 2 minutes running on 6 threads, and the index size is 1.6G
Copy $ rsem-prepare-reference -p 6 --gtf gencode.v33.annotation.gtf \
GRCh38.primary_assembly.genome.fa gencode.v33_rsem-1.3.3