DGE analysis with Salmon/Kallisto input

Analysis and result presented was performed with Salmon counts, Code snippet to import Kallisto counts is also provided

Load required libraries

library(data.table)
library(GenomicFeatures)
library(tximport)
library(DESeq2)
library(apeglm)
library(ggplot2)
library(ggrepel)
library(EnhancedVolcano)

Import clinical data

Load clinical information and define file path

dir = "../../"
sampleData = paste0(dir, "clinical.txt")
sampleData = fread(sampleData)
rownames(sampleData) = sampleData$ENA_RUN
sampleData$individual = as.factor(sampleData$individual)
sampleData$paris_classification = as.factor(sampleData$paris_classification)

# Use relevel() to set adjacent normal samples as reference
sampleData$paris_classification = relevel(sampleData$paris_classification, ref = "normal")

files = file.path(paste0(dir, "salmon"), list.files(paste0(dir, "salmon")), "quant.sf")
names(files) = list.files(paste0(dir, "salmon"))

Load TxDB

Load TxDB and construct a two column data.frame tx2gene with the transcript and gene identifiers

txdb.filename = "/home/USER/db/refanno/gencode.v33.annotation.sqlite"
txdb = loadDb(txdb.filename)

k = keys(txdb, keytype = "TXNAME")
tx2gene = select(txdb, k, "GENEID", "TXNAME")
> head(tx2gene)
             TXNAME            GENEID
1 ENST00000456328.2 ENSG00000223972.5
2 ENST00000450305.2 ENSG00000223972.5
3 ENST00000473358.1 ENSG00000243485.5
4 ENST00000469289.1 ENSG00000243485.5
5 ENST00000607096.1 ENSG00000284332.1
6 ENST00000606857.1 ENSG00000268020.3

Load expression data

Salmon

files = file.path(paste0(dir, "salmon"), list.files(paste0(dir, "salmon")), "quant.sf")
names(files) = list.files(paste0(dir, "salmon"))
> files
                        ERR2675454                         ERR2675455 
"../../salmon/ERR2675454/quant.sf" "../../salmon/ERR2675455/quant.sf" 
                        ERR2675458                         ERR2675459 
"../../salmon/ERR2675458/quant.sf" "../../salmon/ERR2675459/quant.sf" 
                        ERR2675460                         ERR2675461 
"../../salmon/ERR2675460/quant.sf" "../../salmon/ERR2675461/quant.sf" 
                        ERR2675464                         ERR2675465 
"../../salmon/ERR2675464/quant.sf" "../../salmon/ERR2675465/quant.sf" 
                        ERR2675468                         ERR2675469 
"../../salmon/ERR2675468/quant.sf" "../../salmon/ERR2675469/quant.sf" 
                        ERR2675472                         ERR2675473 
"../../salmon/ERR2675472/quant.sf" "../../salmon/ERR2675473/quant.sf" 
                        ERR2675476                         ERR2675477 
"../../salmon/ERR2675476/quant.sf" "../../salmon/ERR2675477/quant.sf" 
                        ERR2675478                         ERR2675479 
"../../salmon/ERR2675478/quant.sf" "../../salmon/ERR2675479/quant.sf" 
                        ERR2675480                         ERR2675481 
"../../salmon/ERR2675480/quant.sf" "../../salmon/ERR2675481/quant.sf" 
                        ERR2675484                         ERR2675485 
"../../salmon/ERR2675484/quant.sf" "../../salmon/ERR2675485/quant.sf"  

Import Salmon NumReads values with tximport which contains the transcript-level counts (txIn = TRUE), abundances and average transcript lengths, then output gene-level summarization (txOut = FALSE)

txi = tximport(files, type = "salmon", tx2gene = tx2gene, 
txIn = TRUE, txOut = FALSE, countsFromAbundance = "no")
> class(txi)
[1] "list"

> names(txi)
[1] "abundance"           "counts"             
[3] "length"              "countsFromAbundance"

Kallisto

files = file.path(paste0(dir, "kallisto"), list.files(paste0(dir, "kallisto")), "abundance.h5")
names(files) = list.files(paste0(dir, "kallisto"))
> files
                              ERR2675454                               ERR2675455 
"../../kallisto/ERR2675454/abundance.h5" "../../kallisto/ERR2675455/abundance.h5" 
                              ERR2675458                               ERR2675459 
"../../kallisto/ERR2675458/abundance.h5" "../../kallisto/ERR2675459/abundance.h5" 
                              ERR2675460                               ERR2675461 
"../../kallisto/ERR2675460/abundance.h5" "../../kallisto/ERR2675461/abundance.h5" 
                              ERR2675464                               ERR2675465 
"../../kallisto/ERR2675464/abundance.h5" "../../kallisto/ERR2675465/abundance.h5" 
                              ERR2675468                               ERR2675469 
"../../kallisto/ERR2675468/abundance.h5" "../../kallisto/ERR2675469/abundance.h5" 
                              ERR2675472                               ERR2675473 
"../../kallisto/ERR2675472/abundance.h5" "../../kallisto/ERR2675473/abundance.h5" 
                              ERR2675476                               ERR2675477 
"../../kallisto/ERR2675476/abundance.h5" "../../kallisto/ERR2675477/abundance.h5" 
                              ERR2675478                               ERR2675479 
"../../kallisto/ERR2675478/abundance.h5" "../../kallisto/ERR2675479/abundance.h5" 
                              ERR2675480                               ERR2675481 
"../../kallisto/ERR2675480/abundance.h5" "../../kallisto/ERR2675481/abundance.h5" 
                              ERR2675484                               ERR2675485 
"../../kallisto/ERR2675484/abundance.h5" "../../kallisto/ERR2675485/abundance.h5" 

Import Kallisto abundance.h5 files with tximport

txi = tximport(files, type = "kallisto", tx2gene = tx2gene, 
txIn = TRUE, txOut = FALSE, countsFromAbundance = "no")
> class(txi)
[1] "list"

> names(txi)
[1] "abundance"           "counts"             
[3] "infReps"             "length"             
[5] "countsFromAbundance"

Create DESeqDataSet object

Build a DESeqDataSet from txi, providing also the sample information and a design formula. The DESeqDataSetFromTximport function will take the txi$counts values and use the round function to round them to nearest integer as documented here. The design indicates how to model the samples, here, that we want to measure the difference in expression between pre-lesion and adjacent normal tissue in a paired-sample design

dds = DESeqDataSetFromTximport(txi, 
colData = sampleData, ~ individual + paris_classification)
> dds
class: DESeqDataSet 
dim: 60233 20 
metadata(1): version
assays(2): counts avgTxLength
rownames(60233): ENSG00000000003.15 ENSG00000000005.6 ... ENSG00000288587.1 ENSG00000288588.1
rowData names(0):
colnames(20): ERR2675454 ERR2675455 ... ERR2675484 ERR2675485
colData names(11): ENA_RUN individual ... braf_status kras_status

Differential expression analysis

Run DESeq2 analysis using DESeq, which performs (1) estimation of size factors, (2) estimation of dispersion, then (3) Negative Binomial GLM fitting and Wald statistics. The results tables (log2 fold changes and p-values) can be generated using the results function

dds = DESeq(dds)
> cbind(resultsNames(dds))
      [,1]                                  
 [1,] "Intercept"                           
 [2,] "individual_S11_vs_S1"                
 [3,] "individual_S12_vs_S1"                
 [4,] "individual_S15_vs_S1"                
 [5,] "individual_S17_vs_S1"                
 [6,] "individual_S3_vs_S1"                 
 [7,] "individual_S5_vs_S1"                 
 [8,] "individual_S6_vs_S1"                 
 [9,] "individual_S7_vs_S1"                 
[10,] "individual_S9_vs_S1"                 
[11,] "paris_classification_0.IIa_vs_normal"

We set the adjusted p-value cutoff (FDR) to be 0.05, hence we change the default significance cutoff used for optimizing the independent filtering alpha from 0.1 to 0.05

res <- results(dds, name = "paris_classification_0.IIa_vs_normal", alpha = 0.05)

At a FDR of 0.05, we have 3274 genes up-regulated and 3342 genes down-regulated in 0-IIa pre-lesion versus Normal

> summary(res)

out of 37313 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 3274, 8.8%
LFC < 0 (down)     : 3342, 9%
outliers [1]       : 0, 0%
low counts [2]     : 15400, 41%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

The results res object contains the follow columns

> mcols(res)$description
[1] "mean of normalized counts for all samples"                   
[2] "log2 fold change (MLE): paris classification 0.IIa vs normal"
[3] "standard error: paris classification 0.IIa vs normal"        
[4] "Wald statistic: paris classification 0.IIa vs normal"        
[5] "Wald test p-value: paris classification 0.IIa vs normal"     
[6] "BH adjusted p-values"                                        

> head(res)
log2 fold change (MLE): paris classification 0.IIa vs normal 
Wald test p-value: paris classification 0.IIa vs normal 
DataFrame with 6 rows and 6 columns
                           baseMean     log2FoldChange             lfcSE               stat               pvalue                padj
                          <numeric>          <numeric>         <numeric>          <numeric>            <numeric>           <numeric>
ENSG00000000003.15 1546.35486175611 -0.581311619578096 0.174441213818185  -3.33242131749887 0.000860938112156993 0.00453612331129988
ENSG00000000005.6  33.9170595345844 -0.127395801668489  0.36047271289215 -0.353413163083456    0.723778719010441   0.832554491846499
ENSG00000000419.12 810.269365079278 0.0330353444973176 0.164082074011246  0.201334269428197    0.840437208028287   0.909277206454224
ENSG00000000457.14 433.817917089428 -0.206547272035916  0.12218101991675  -1.69050211052953   0.0909319348233635   0.201842735796633
ENSG00000000460.17 134.776100984046  0.154247180040075 0.221278370791785  0.697073010290807    0.485757103825257    0.65367203488841
ENSG00000000938.13 53.6431744642057 0.0891342530564219 0.288015018855273  0.309477795327096    0.756958100440997   0.855339071823214

Read about authors' note on p-values and adjusted p-values set to NA here

Exploring results

MA-plot

We use MA-plot to show the log2 fold changes attributable to a given variable (i.e. pre-lesion) over the mean of normalized counts for all the samples. Points will be colored red if the adjusted p value is less than 0.05. Points which fall out of the window are plotted as open triangles pointing either up or down

resLFC = lfcShrink(dds, coef = "paris_classification_0.IIa_vs_normal", 
type="apeglm")

png("DGE_MA-plot.Salmon.png", width=7, height=5, units = "in", res = 300)
plotMA(resLFC, alpha = 0.05, ylim=c(-6,6), 
main = "MA-plot for the shrunken log2 fold changes")
dev.off()

Principal component plot of the samples

First, we perform count data transformation with regularized logarithm rlog or variance stabilizing transformations vst. You can read here about which transformation to choose

rld = rlog(dds)
vsd = vst(dds)

Perform sample PCA for transformed data using plotPCA, then plot with ggplot

# rlog
pcaData = plotPCA(rld, intgroup=c("individual","paris_classification"), 
returnData=TRUE)
percentVar = round(100 * attr(pcaData, "percentVar"))

png("DGE_PCA-rlog.Salmon.png", width=7, height=7, units = "in", res = 300)
ggplot(pcaData, aes(PC1, PC2, colour = paris_classification)) + 
geom_point(size = 2) + theme_bw() + 
scale_color_manual(values = c("blue", "red")) +
geom_text_repel(aes(label = individual), nudge_x = -1, nudge_y = 0.2, size = 3) +
ggtitle("Principal Component Analysis (PCA)", subtitle = "rlog transformation") +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance"))
dev.off()
# vst
pcaData = plotPCA(vsd, intgroup=c("individual","paris_classification"), 
returnData=TRUE)
percentVar = round(100 * attr(pcaData, "percentVar"))

png("DGE_PCA-vst.Salmon.png", width=7, height=7, units = "in", res = 300)
ggplot(pcaData, aes(PC1, PC2, colour = paris_classification)) + 
geom_point(size = 2) +theme_bw() + scale_color_manual(values = c("blue", "red")) +
geom_text_repel(aes(label = individual), nudge_x = -1, nudge_y = 0.2, size = 3) +
ggtitle("Principal Component Analysis (PCA)", subtitle = "vst transformation") +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance"))
dev.off()

Volcano plots

We use EnhancedVolcano to generate volcano plot to visualise the results of differential expression analyses

pCutoff = 0.05
FCcutoff = 1.0

p = EnhancedVolcano(data.frame(res), lab = NA, x = 'log2FoldChange', y = 'padj',
xlab = bquote(~Log[2]~ 'fold change'), ylab = bquote(~-Log[10]~adjusted~italic(P)),
pCutoff = pCutoff, FCcutoff = FCcutoff, pointSize = 1.0, labSize = 2.0,
title = "Volcano plot", subtitle = "SSA/P vs. Normal",
caption = paste0('log2 FC cutoff: ', FCcutoff, '; p-value cutoff: ', pCutoff, '\nTotal = ', nrow(res), ' variables'),
legend=c('NS','Log2 FC','Adjusted p-value', 'Adjusted p-value & Log2 FC'),
legendPosition = 'bottom', legendLabSize = 14, legendIconSize = 5.0)

png("DGE_VolcanoPlots.Salmon.png", width=7, height=7, units = "in", res = 300)
print(p)
dev.off()

Exporting results

We construct a normData data.frame to store per-group normalised mean and normalised counts of all samples, and a deData data.frame to store the DESeq results. We merge both data.frame with a annoData data.frame that contain per-gene annotation derived from GENCODE GTF file before exporting the results in tabular format

annoData = "/home/USER/db/refanno/gencode.v33.annotation_genes.txt"
annoData = data.frame(fread(annoData))

normCounts = as.data.frame(counts(dds, normalized = TRUE))
baseMeans = as.data.frame(sapply( levels(dds$paris_classification), 
function(lvl) rowMeans( counts(dds, normalized = TRUE)[, dds$paris_classification == lvl, drop = FALSE] ) ))

normData = merge(annoData, merge(baseMeans, normCounts, by.x = 'row.names', by.y = 'row.names'), by.x = 'GeneID', by.y = 'Row.names')
normData = normData[order(normData$Chromosome, normData$Start, normData$End),]

deData = data.frame(res[,c(1,2,5,6)])
colnames(deData) = c("baseMean","log2fc","pvalue","padj")
deData = merge(annoData, deData, by.x = 'GeneID', by.y = 'row.names')
deData = deData[order(deData$Chromosome, deData$Start, deData$End),]

write.table(normData, file="DGE_DESeq2_Means_and_NormalisedCount.Salmon.txt", 
sep = "\t", quote = F, row.names = F, col.names = T)

write.table(deData, file="DGE_DESeq2_DE_results.Salmon.txt", 
sep = "\t", quote = F, row.names = F, col.names = T)
> normData[10000:10005,1:12]
                  GeneID GeneSymbol Chromosome    Start      End                Class Strand Length      normal        0-IIa ERR2675454  ERR2675455
20530  ENSG00000206816.1 AP000609.1      chr11 77691650 77691751             misc_RNA      +    101   0.0000000   0.00000000   0.000000   0.0000000
42246  ENSG00000254985.1   RSF1-IT2      chr11 77717712 77718411               lncRNA      -    699   0.4097166   0.07330207   0.000000   2.9207420
27254  ENSG00000227376.1    FTH1P16      chr11 77734475 77735026 processed_pseudogene      -    551   2.3712307   2.06310048   6.536917   0.9725242
42622  ENSG00000255409.1   RSF1-IT1      chr11 77738680 77739568               lncRNA      -    888   0.4259810   0.17161175   0.000000   0.9826625
23603  ENSG00000219529.2 AP000580.1      chr11 77813319 77813676 processed_pseudogene      -    357  10.6673039   8.50670790  14.406289   6.7761005
1755  ENSG00000087884.14      AAMDC      chr11 77821109 77918432       protein_coding      +  97323 301.8521056 189.48919021 224.626498 349.5300907

> deData[10000:10005,]
                  GeneID GeneSymbol Chromosome    Start      End                Class Strand Length    baseMean      log2fc       pvalue         padj
20530  ENSG00000206816.1 AP000609.1      chr11 77691650 77691751             misc_RNA      +    101   0.0000000          NA           NA           NA
42246  ENSG00000254985.1   RSF1-IT2      chr11 77717712 77718411               lncRNA      -    699   0.2415093 -0.62072985 8.366352e-01           NA
27254  ENSG00000227376.1    FTH1P16      chr11 77734475 77735026 processed_pseudogene      -    551   2.2171656 -0.02912317 9.758744e-01 9.860894e-01
42622  ENSG00000255409.1   RSF1-IT1      chr11 77738680 77739568               lncRNA      -    888   0.2987963 -0.36554563 9.035198e-01           NA
23603  ENSG00000219529.2 AP000580.1      chr11 77813319 77813676 processed_pseudogene      -    357   9.5870059 -0.05559022 9.396577e-01 9.673816e-01
1755  ENSG00000087884.14      AAMDC      chr11 77821109 77918432       protein_coding      +  97323 245.6706479 -0.68973212 6.942611e-08 9.677700e-07

Session info

> sessionInfo()
R version 3.6.2 (2019-12-12)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 18.04.4 LTS

Matrix products: default
BLAS/LAPACK: /home/USER/miniconda3/lib/libopenblasp-r0.3.9.so

locale:
 [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
 [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
 [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats4    parallel  stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] EnhancedVolcano_1.4.0       ggrepel_0.8.2              
 [3] ggplot2_3.3.0               apeglm_1.8.0               
 [5] DESeq2_1.26.0               SummarizedExperiment_1.16.1
 [7] DelayedArray_0.12.3         BiocParallel_1.20.1        
 [9] matrixStats_0.56.0          tximport_1.14.2            
[11] GenomicFeatures_1.38.2      AnnotationDbi_1.48.0       
[13] Biobase_2.46.0              GenomicRanges_1.38.0       
[15] GenomeInfoDb_1.22.1         IRanges_2.20.2             
[17] S4Vectors_0.24.4            BiocGenerics_0.32.0        
[19] data.table_1.12.8          

loaded via a namespace (and not attached):
 [1] bitops_1.0-6             bit64_0.9-7              RColorBrewer_1.1-2      
 [4] progress_1.2.2           httr_1.4.1               numDeriv_2016.8-1.1     
 [7] tools_3.6.2              backports_1.1.6          R6_2.4.1                
[10] rpart_4.1-15             Hmisc_4.4-0              DBI_1.1.0               
[13] colorspace_1.4-1         nnet_7.3-14              withr_2.2.0             
[16] tidyselect_1.0.0         gridExtra_2.3            prettyunits_1.1.1       
[19] bit_1.1-15.2             curl_4.3                 compiler_3.6.2          
[22] htmlTable_1.13.3         labeling_0.3             rtracklayer_1.46.0      
[25] scales_1.1.0             checkmate_2.0.0          mvtnorm_1.1-0           
[28] readr_1.3.1              genefilter_1.68.0        askpass_1.1             
[31] rappdirs_0.3.1           stringr_1.4.0            digest_0.6.25           
[34] Rsamtools_2.2.3          foreign_0.8-76           XVector_0.26.0          
[37] base64enc_0.1-3          jpeg_0.1-8.1             pkgconfig_2.0.3         
[40] htmltools_0.4.0          bbmle_1.0.23.1           dbplyr_1.4.3            
[43] htmlwidgets_1.5.1        rlang_0.4.6              rstudioapi_0.11         
[46] RSQLite_2.2.0            farver_2.0.3             jsonlite_1.6.1          
[49] acepack_1.4.1            dplyr_0.8.5              RCurl_1.98-1.2          
[52] magrittr_1.5             GenomeInfoDbData_1.2.2   Formula_1.2-3           
[55] Matrix_1.2-18            Rcpp_1.0.4.6             munsell_0.5.0           
[58] lifecycle_0.2.0          stringi_1.4.6            MASS_7.3-51.6           
[61] zlibbioc_1.32.0          plyr_1.8.6               BiocFileCache_1.10.2    
[64] grid_3.6.2               blob_1.2.1               bdsmatrix_1.3-4         
[67] crayon_1.3.4             lattice_0.20-41          Biostrings_2.54.0       
[70] splines_3.6.2            annotate_1.64.0          hms_0.5.3               
[73] locfit_1.5-9.4           knitr_1.28               pillar_1.4.4            
[76] geneplotter_1.64.0       biomaRt_2.42.1           XML_3.99-0.3            
[79] glue_1.4.0               latticeExtra_0.6-29      png_0.1-7               
[82] vctrs_0.2.4              gtable_0.3.0             openssl_1.4.1           
[85] purrr_0.3.4              assertthat_0.2.1         emdbook_1.3.12          
[88] xfun_0.13                xtable_1.8-4             coda_0.19-3             
[91] survival_3.1-12          tibble_3.0.1             GenomicAlignments_1.22.1
[94] memoise_1.1.0            cluster_2.1.0            ellipsis_0.3.0          

Last updated