Softwares and databases

Reference genome

$ cd /home/USER/db/refanno

Obtain from GENCODE

* Latest stable version available at the time of writing

https://www.gencodegenes.org/human/release_33.html; Human v33 (GRCh38)

# Genome sequence, primary assembly
$ wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_33/GRCh38.primary_assembly.genome.fa.gz
$ gunzip GRCh38.primary_assembly.genome.fa.gz
# Comprehensive gene annotation
$ wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_33/gencode.v33.annotation.gtf.gz
$ gunzip gencode.v33.annotation.gtf.gz
# Transcript sequences
$ wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_33/gencode.v33.transcripts.fa.gz
$ gunzip gencode.v33.transcripts.fa.gz

You can also retrieve mouse data from https://www.gencodegenes.org/mouse/; Mouse vM24 (GRCm38)

Clean-up fasta header

$ sed 's/|ENSG.*//' gencode.v33.transcripts.fa > gencode.v33.transcripts.clean.fa
$ grep ">" gencode.v33.transcripts.fa | head
>ENST00000456328.2|ENSG00000223972.5|OTTHUMG00000000961.2|OTTHUMT00000362751.1|DDX11L1-202|DDX11L1|1657|processed_transcript|
>ENST00000450305.2|ENSG00000223972.5|OTTHUMG00000000961.2|OTTHUMT00000002844.2|DDX11L1-201|DDX11L1|632|transcribed_unprocessed_pseudogene|
>ENST00000488147.1|ENSG00000227232.5|OTTHUMG00000000958.1|OTTHUMT00000002839.1|WASH7P-201|WASH7P|1351|unprocessed_pseudogene|
>ENST00000619216.1|ENSG00000278267.1|-|-|MIR6859-1-201|MIR6859-1|68|miRNA|
>ENST00000473358.1|ENSG00000243485.5|OTTHUMG00000000959.2|OTTHUMT00000002840.1|MIR1302-2HG-202|MIR1302-2HG|712|lncRNA|
>ENST00000469289.1|ENSG00000243485.5|OTTHUMG00000000959.2|OTTHUMT00000002841.2|MIR1302-2HG-201|MIR1302-2HG|535|lncRNA|
>ENST00000607096.1|ENSG00000284332.1|-|-|MIR1302-2-201|MIR1302-2|138|miRNA|
>ENST00000417324.1|ENSG00000237613.2|OTTHUMG00000000960.1|OTTHUMT00000002842.1|FAM138A-201|FAM138A|1187|lncRNA|
>ENST00000461467.1|ENSG00000237613.2|OTTHUMG00000000960.1|OTTHUMT00000002843.1|FAM138A-202|FAM138A|590|lncRNA|
>ENST00000606857.1|ENSG00000268020.3|OTTHUMG00000185779.1|OTTHUMT00000471235.1|OR4G4P-201|OR4G4P|840|unprocessed_pseudogene|

$ grep ">" gencode.v33.transcripts.clean.fa | head
>ENST00000456328.2
>ENST00000450305.2
>ENST00000488147.1
>ENST00000619216.1
>ENST00000473358.1
>ENST00000469289.1
>ENST00000607096.1
>ENST00000417324.1
>ENST00000461467.1
>ENST00000606857.1

Create tabular info from GTF

We will use awk and sed to convert the gene-level and transcript-level information stored in GTF file into tabular file for easy access

cat gencode.v33.annotation.gtf | \
awk 'BEGIN{FS="\t"}{split($9,a,";"); if($3~"gene") print a[1]"\t"a[3]"\t"$1":"$4"-"$5"\t"a[2]"\t"$7}' | \
sed 's/gene_id "//' | \
sed 's/gene_name "//'| \
sed 's/gene_type "//' | \
sed 's/"//g' | \
awk 'BEGIN{FS="\t"}{split($3,a,"[:-]"); print $1"\t"$2"\t"a[1]"\t"a[2]"\t"a[3]"\t"$4"\t"$5"\t"a[3]-a[2];}' | \
sort -k3,3 -k4,4n -k5,5n | \
sed "1i\GeneID\tGeneSymbol\tChromosome\tStart\tEnd\tClass\tStrand\tLength" \
> gencode.v33.annotation_genes.txt
gencode.v33.annotation_genes.txt
GeneID	GeneSymbol	Chromosome	Start	End	Class	Strand	Length
ENSG00000223972.5	 DDX11L1	chr1	11869	14409	 transcribed_unprocessed_pseudogene	+	2540
ENSG00000227232.5	 WASH7P	chr1	14404	29570	 unprocessed_pseudogene	-	15166
ENSG00000278267.1	 MIR6859-1	chr1	17369	17436	 miRNA	-	67
ENSG00000243485.5	 MIR1302-2HG	chr1	29554	31109	 lncRNA	+	1555
ENSG00000284332.1	 MIR1302-2	chr1	30366	30503	 miRNA	+	137
ENSG00000237613.2	 FAM138A	chr1	34554	36081	 lncRNA	-	1527
ENSG00000268020.3	 OR4G4P	chr1	52473	53312	 unprocessed_pseudogene	+	839
ENSG00000240361.2	 OR4G11P	chr1	57598	64116	 transcribed_unprocessed_pseudogene	+	6518
ENSG00000186092.6	 OR4F5	chr1	65419	71585	 protein_coding	+	6166
cat gencode.v33.annotation.gtf | \
awk 'BEGIN{FS="\t"}{split($9,a,";"); if($3~"transcript") print a[1]"\t"a[4]"\t"$1":"$4"-"$5"\t"a[2]"\t"a[6]"\t"a[5]"\t"$7}' | \
sed 's/gene_id "//' | \
sed 's/gene_name "//'| \
sed 's/transcript_id "//' | \
sed 's/transcript_name "//' | \
sed 's/transcript_type "//' | \
sed 's/"//g' | \
awk 'BEGIN{FS="\t"}{split($3,a,"[:-]"); print $1"\t"$2"\t"a[1]"\t"a[2]"\t"a[3]"\t"$4"\t"$5"\t"$6"\t"$7"\t"a[3]-a[2];}' | \
sort -k3,3 -k4,4n -k5,5n | \
sed "1i\GeneID\tGeneSymbol\tChromosome\tStart\tEnd\tTranscriptID\tTranscriptName\tClass\tStrand\tLength" \
> gencode.v33.annotation_transcripts.txt
gencode.v33.annotation_transcripts.txt
GeneID	GeneSymbol	Chromosome	Start	End	TranscriptID	TranscriptName	Class	Strand	Length
ENSG00000223972.5	 DDX11L1	chr1	11869	14409	 ENST00000456328.2	 DDX11L1-202	 processed_transcript	+	2540
ENSG00000223972.5	 DDX11L1	chr1	12010	13670	 ENST00000450305.2	 DDX11L1-201	 transcribed_unprocessed_pseudogene	+	1660
ENSG00000227232.5	 WASH7P	chr1	14404	29570	 ENST00000488147.1	 WASH7P-201	 unprocessed_pseudogene	-	15166
ENSG00000278267.1	 MIR6859-1	chr1	17369	17436	 ENST00000619216.1	 MIR6859-1-201	 miRNA	-	67
ENSG00000243485.5	 MIR1302-2HG	chr1	29554	31097	 ENST00000473358.1	 MIR1302-2HG-202	 lncRNA	+	1543
ENSG00000243485.5	 MIR1302-2HG	chr1	30267	31109	 ENST00000469289.1	 MIR1302-2HG-201	 lncRNA	+	842
ENSG00000284332.1	 MIR1302-2	chr1	30366	30503	 ENST00000607096.1	 MIR1302-2-201	 miRNA	+	137
ENSG00000237613.2	 FAM138A	chr1	34554	36081	 ENST00000417324.1	 FAM138A-201	 lncRNA	-	1527
ENSG00000237613.2	 FAM138A	chr1	35245	36073	 ENST00000461467.1	 FAM138A-202	 lncRNA	-	828

Obtaining softwares

$ cd /home/USER/tools

You can place the executables or make symbolic links in a location recognisable by$PATH or make changes to$PATH to avoid needing to specify full paths when using the programs. All codes provided assume executables can be found in $PATH.

* Latest stable version available at the time of writing

FastQC

https://www.bioinformatics.babraham.ac.uk/projects/fastqc; v0.11.9

Download and unzip the file. Change fastqc permission from 664 (-rw-rw-r--) to 755 (-rwxr-xr-x)

$ wget https://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_v0.11.9.zip
$ unzip fastqc_v0.11.9.zip
$ cd FastQC
$ chmod 755 fastqc
$ ./fastqc --help

            FastQC - A high throughput sequence QC analysis tool

SYNOPSIS

	fastqc seqfile1 seqfile2 .. seqfileN

    fastqc [-o output dir] [--(no)extract] [-f fastq|bam|sam] 
           [-c contaminant file] seqfile1 .. seqfileN

DESCRIPTION

    FastQC reads a set of sequence files and produces from each one a quality
    control report consisting of a number of different modules, each one of 
    which will help to identify a different potential type of problem in your
    data.
    
    If no files to process are specified on the command line then the program
    will start as an interactive graphical application.  If files are provided
    on the command line then the program will run with no user interaction
    required.  In this mode it is suitable for inclusion into a standardised
    analysis pipeline.

BBTools

https://jgi.doe.gov/data-and-tools/bbtools; v38.82

$ wget -O BBMap_38.82.tar.gz 'https://downloads.sourceforge.net/project/bbmap/BBMap_38.82.tar.gz?r=https%3A%2F%2Fsourceforge.net%2Fprojects%2Fbbmap%2Ffiles%2FBBMap_38.82.tar.gz%2Fdownload&ts=1587572478'
$ tar xvf BBMap_38.82.tar.gz
$ cd bbmap
$ ./bbduk.sh

Written by Brian Bushnell
Last modified March 24, 2020

Description:  Compares reads to the kmers in a reference dataset, optionally 
allowing an edit distance. Splits the reads into two outputs - those that 
match the reference, and those that don't. Can also trim (remove) the matching 
parts of the reads rather than binning the reads.
Please read bbmap/docs/guides/BBDukGuide.txt for more information.

Usage:  bbduk.sh in=<input file> out=<output file> ref=<contaminant files>

Input may be stdin or a fasta or fastq file, compressed or uncompressed.
If you pipe via stdin/stdout, please include the file type; e.g. for gzipped 
fasta input, set in=stdin.fa.gz

Salmon

https://github.com/COMBINE-lab/salmon; v1.2.0

$ wget https://github.com/COMBINE-lab/salmon/releases/download/v1.2.1/salmon-1.2.1_linux_x86_64.tar.gz
$ tar xvf salmon-1.2.1_linux_x86_64.tar.gz
$ mv salmon-latest_linux_x86_64 salmon-1.2.1
$ cd salmon-1.2.1/bin
$ ./salmon 
salmon v1.2.1

Usage:  salmon -h|--help or 
        salmon -v|--version or 
        salmon -c|--cite or 
        salmon [--no-version-check] <COMMAND> [-h | options]

Commands:
     index Create a salmon index
     quant Quantify a sample
     alevin single cell analysis
     swim  Perform super-secret operation
     quantmerge Merge multiple quantifications into a single file

kallisto

https://github.com/pachterlab/kallisto; v0.46.2

Kallisto is phasing out HDF5 at the time of writing, binaries for this release (v0.46.2) are compiled with HDF5 built in

$ wget https://github.com/pachterlab/kallisto/releases/download/v0.46.2/kallisto_linux-v0.46.2.tar.gz
$ tar zxf kallisto_linux-v0.46.2.tar.gz
$ mv kallisto kallisto-v0.46.2
$ cd kallisto-v0.46.2
$ ./kallisto 
kallisto 0.46.2

Usage: kallisto <CMD> [arguments] ..

Where <CMD> can be one of:

    index         Builds a kallisto index 
    quant         Runs the quantification algorithm 
    bus           Generate BUS files for single-cell data 
    pseudo        Runs the pseudoalignment step 
    merge         Merges several batch runs 
    h5dump        Converts HDF5-formatted results to plaintext
    inspect       Inspects and gives information about an index
    version       Prints version information
    cite          Prints citation information

Running kallisto <CMD> without arguments prints usage information for <CMD>

STAR

https://github.com/alexdobin/STAR; v2.7.3a

Download and unpack the file. Use make to compile STAR

$ wget -O STAR-2.7.3a.tar.gz https://github.com/alexdobin/STAR/archive/2.7.3a.tar.gz
$ tar xvf STAR-2.7.3a.tar.gz
$ cd STAR-2.7.3a/source
$ make STAR
$ ./STAR
Usage: STAR  [options]... --genomeDir /path/to/genome/index/   --readFilesIn R1.fq R2.fq
Spliced Transcripts Alignment to a Reference (c) Alexander Dobin, 2009-2019

For more details see:
<https://github.com/alexdobin/STAR>
<https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf>

To list all parameters, run STAR --help

RSEM

https://github.com/deweylab/RSEM; v1.3.3

$ wget -O RSEM-1.3.3.tar.gz https://github.com/deweylab/RSEM/archive/v1.3.3.tar.gz
$ tar zxf RSEM-1.3.3.tar.gz
$ cd RSEM-1.3.3

By default, RSEM executables are installed to /usr/local/bin. You can change the installation location by setting DESTDIR and/or prefix variables. The RSEM executables will be installed to ${DESTDIR}${prefix}/bin. The default values of DESTDIR and prefix are DESTDIR= and prefix=/usr/local.

$ make
$ make install
$ rsem-calculate-expression
Invalid number of arguments!
NAME
    rsem-calculate-expression - Estimate gene and isoform expression from
    RNA-Seq data.

SYNOPSIS
     rsem-calculate-expression [options] upstream_read_file(s) reference_name sample_name 
     rsem-calculate-expression [options] --paired-end upstream_read_file(s) downstream_read_file(s) reference_name sample_name 
     rsem-calculate-expression [options] --alignments [--paired-end] input reference_name sample_name

Preparing indices

$ cd /home/USER/db/refanno

Salmon index

SAF genome index (recommended)

  • Full genome is used as decoy, largde index but least prone to spurious alignments

  • Indexing takes under 40 minutes running on 6 threads, and the index size is 17 GB

$ grep "^>" GRCh38.primary_assembly.genome.fa | cut -d " " -f 1 > GRCh38.decoys.txt
$ sed -i 's/>//g' GRCh38.decoys.txt
$ cat gencode.v33.transcripts.fa GRCh38.primary_assembly.genome.fa | gzip > GRCh38.gentrome.fa.gz

$ salmon index -p 6 --gencode -t GRCh38.gentrome.fa.gz \
-d GRCh38.decoys.txt -i gencode.v33_decoys_salmon-1.2.0

Alternatively, cDNA-only index

  • No decoy used, small index but most prone to possible spurious alignments

  • Indexing takes under 3 minutes running on 6 threads, and the index size is 782 MB

$ salmon index -p 6 --gencode -t gencode.v33.transcripts.fa \
-i gencode.v33_salmon-1.2.0

kallisto index

  • Use header-cleaned transcript file to create index

  • Indexing takes under 6 minutes running on a single thread, and the index size is 3.0G

$ kallisto index -i gencode.v33_kallisto-0.46.2 gencode.v33.transcripts.clean.fa

STAR index

The --sjdbOverhang option is set as 150 in this tutorial. This length should be equal to the ReadLength-1of the fastq files. The STAR manual suggests the ideal value is max(ReadLength)-1 in case of reads of varying length. In most cases, the default value of 100 will work as well as the ideal value.

Indexing takes about 70 minutes running on 6 threads, and the index size is 28 GB

$ mkdir gencode.v33_star-2.7.3a

$ STAR --runMode genomeGenerate --runThreadN 6 \
--genomeDir gencode.v33_star-2.7.3a \
--genomeFastaFiles GRCh38.primary_assembly.genome.fa \
--sjdbGTFfile gencode.v33.annotation.gtf --sjdbOverhang 150

RSEM index

Indexing takes under 2 minutes running on 6 threads, and the index size is 1.6G

$ rsem-prepare-reference -p 6 --gtf gencode.v33.annotation.gtf \
GRCh38.primary_assembly.genome.fa gencode.v33_rsem-1.3.3

Last updated