DTE analysis with Salmon/Kallisto input

Analysis and result presented was performed with Salmon counts, Code snippet to import Kallisto counts is also provided

Load required libraries

library(data.table)
library(tximport)
library(DESeq2)
library(apeglm)
library(ggplot2)
library(ggrepel)
library(EnhancedVolcano)

Import clinical data

Load clinical information and define file path

dir = "../../"
sampleData = paste0(dir, "clinical.txt")
sampleData = fread(sampleData)
rownames(sampleData) = sampleData$ENA_RUN
sampleData$individual = as.factor(sampleData$individual)
sampleData$paris_classification = as.factor(sampleData$paris_classification)

# Use relevel() to set adjacent normal samples as reference
sampleData$paris_classification = relevel(sampleData$paris_classification, ref = "normal")

Load expression data

Salmon

files = file.path(paste0(dir, "salmon"), list.files(paste0(dir, "salmon")), "quant.sf")
names(files) = list.files(paste0(dir, "salmon"))
> files
                        ERR2675454                         ERR2675455 
"../../salmon/ERR2675454/quant.sf" "../../salmon/ERR2675455/quant.sf" 
                        ERR2675458                         ERR2675459 
"../../salmon/ERR2675458/quant.sf" "../../salmon/ERR2675459/quant.sf" 
                        ERR2675460                         ERR2675461 
"../../salmon/ERR2675460/quant.sf" "../../salmon/ERR2675461/quant.sf" 
                        ERR2675464                         ERR2675465 
"../../salmon/ERR2675464/quant.sf" "../../salmon/ERR2675465/quant.sf" 
                        ERR2675468                         ERR2675469 
"../../salmon/ERR2675468/quant.sf" "../../salmon/ERR2675469/quant.sf" 
                        ERR2675472                         ERR2675473 
"../../salmon/ERR2675472/quant.sf" "../../salmon/ERR2675473/quant.sf" 
                        ERR2675476                         ERR2675477 
"../../salmon/ERR2675476/quant.sf" "../../salmon/ERR2675477/quant.sf" 
                        ERR2675478                         ERR2675479 
"../../salmon/ERR2675478/quant.sf" "../../salmon/ERR2675479/quant.sf" 
                        ERR2675480                         ERR2675481 
"../../salmon/ERR2675480/quant.sf" "../../salmon/ERR2675481/quant.sf" 
                        ERR2675484                         ERR2675485 
"../../salmon/ERR2675484/quant.sf" "../../salmon/ERR2675485/quant.sf"   

Import Salmon NumReads values with tximport which contains the transcript-level counts (txIn = TRUE), abundances and average transcript lengths, then output transcript-level estimates (txOut = TRUE)

txi = tximport(files, type = "salmon", 
txIn = TRUE, txOut = TRUE, countsFromAbundance = "no")
> class(txi)
[1] "list"

> names(txi)
[1] "abundance"           "counts"             
[3] "length"              "countsFromAbundance"

Kallisto

files = file.path(paste0(dir, "kallisto"), list.files(paste0(dir, "kallisto")), "abundance.h5")
names(files) = list.files(paste0(dir, "kallisto"))
> files
                              ERR2675454                               ERR2675455 
"../../kallisto/ERR2675454/abundance.h5" "../../kallisto/ERR2675455/abundance.h5" 
                              ERR2675458                               ERR2675459 
"../../kallisto/ERR2675458/abundance.h5" "../../kallisto/ERR2675459/abundance.h5" 
                              ERR2675460                               ERR2675461 
"../../kallisto/ERR2675460/abundance.h5" "../../kallisto/ERR2675461/abundance.h5" 
                              ERR2675464                               ERR2675465 
"../../kallisto/ERR2675464/abundance.h5" "../../kallisto/ERR2675465/abundance.h5" 
                              ERR2675468                               ERR2675469 
"../../kallisto/ERR2675468/abundance.h5" "../../kallisto/ERR2675469/abundance.h5" 
                              ERR2675472                               ERR2675473 
"../../kallisto/ERR2675472/abundance.h5" "../../kallisto/ERR2675473/abundance.h5" 
                              ERR2675476                               ERR2675477 
"../../kallisto/ERR2675476/abundance.h5" "../../kallisto/ERR2675477/abundance.h5" 
                              ERR2675478                               ERR2675479 
"../../kallisto/ERR2675478/abundance.h5" "../../kallisto/ERR2675479/abundance.h5" 
                              ERR2675480                               ERR2675481 
"../../kallisto/ERR2675480/abundance.h5" "../../kallisto/ERR2675481/abundance.h5" 
                              ERR2675484                               ERR2675485 
"../../kallisto/ERR2675484/abundance.h5" "../../kallisto/ERR2675485/abundance.h5"  

Import Kallisto abundance.h5 files with tximport

txi = tximport(files, type = "kallisto", 
txIn = TRUE, txOut = TRUE, countsFromAbundance = "no")
> class(txi)
[1] "list"

> names(txi)
[1] "abundance"           "counts"             
[3] "infReps"             "length"             
[5] "countsFromAbundance"

Create DESeqDataSet object

Build a DESeqDataSet from txi, providing also the sample information and a design formula. The DESeqDataSetFromTximport function will take the txi$counts values and use the round function to round them to nearest integer as documented here. The design indicates how to model the samples, here, that we want to measure the difference in expression between pre-lesion and adjacent normal tissue in a paired-sample design

dds = DESeqDataSetFromTximport(txi, 
colData = sampleData, ~ individual + paris_classification)
> dds
class: DESeqDataSet 
dim: 227063 20 
metadata(1): version
assays(2): counts avgTxLength
rownames(227063): ENST00000456328.2 ENST00000450305.2 ... ENST00000387460.2
  ENST00000387461.2
rowData names(0):
colnames(20): ERR2675454 ERR2675455 ... ERR2675484 ERR2675485
colData names(11): ENA_RUN individual ... braf_status kras_status

Differential expression analysis

Run DESeq2 analysis using DESeq, which performs (1) estimation of size factors, (2) estimation of dispersion, then (3) Negative Binomial GLM fitting and Wald statistics. The results tables (log2 fold changes and p-values) can be generated using the results function

dds = DESeq(dds)
> cbind(resultsNames(dds))
      [,1]                                  
 [1,] "Intercept"                           
 [2,] "individual_S11_vs_S1"                
 [3,] "individual_S12_vs_S1"                
 [4,] "individual_S15_vs_S1"                
 [5,] "individual_S17_vs_S1"                
 [6,] "individual_S3_vs_S1"                 
 [7,] "individual_S5_vs_S1"                 
 [8,] "individual_S6_vs_S1"                 
 [9,] "individual_S7_vs_S1"                 
[10,] "individual_S9_vs_S1"                 
[11,] "paris_classification_0.IIa_vs_normal"

We set the adjusted p-value cutoff (FDR) to be 0.05, hence we change the default significance cutoff used for optimizing the independent filtering alpha from 0.1 to 0.05

res <- results(dds, name = "paris_classification_0.IIa_vs_normal", alpha = 0.05)

At a FDR of 0.05, we have 3274 genes up-regulated and 3342 genes down-regulated in 0-IIa pre-lesion versus Normal

> summary(res)

out of 159033 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 5020, 3.2%
LFC < 0 (down)     : 3728, 2.3%
outliers [1]       : 0, 0%
low counts [2]     : 87402, 55%
(mean count < 3)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

The results res object contains the follow columns

> mcols(res)$description
[1] "mean of normalized counts for all samples"                   
[2] "log2 fold change (MLE): paris classification 0.IIa vs normal"
[3] "standard error: paris classification 0.IIa vs normal"        
[4] "Wald statistic: paris classification 0.IIa vs normal"        
[5] "Wald test p-value: paris classification 0.IIa vs normal"     
[6] "BH adjusted p-values"                                        

> head(res)
log2 fold change (MLE): paris classification 0.IIa vs normal 
Wald test p-value: paris classification 0.IIa vs normal 
DataFrame with 6 rows and 6 columns
                            baseMean     log2FoldChange             lfcSE                stat             pvalue              padj
                           <numeric>          <numeric>         <numeric>           <numeric>          <numeric>         <numeric>
ENST00000456328.2  0.399730016527216 -0.199743190436256  3.01346565486448 -0.0662835463592627  0.947152082517689                NA
ENST00000450305.2                  0                 NA                NA                  NA                 NA                NA
ENST00000488147.1   212.684877084654 -0.339571171837121 0.174521709884518   -1.94572452941137 0.0516878384949591 0.229396007387386
ENST00000619216.1                  0                 NA                NA                  NA                 NA                NA
ENST00000473358.1                  0                 NA                NA                  NA                 NA                NA
ENST00000469289.1 0.0480413177220545 -0.493640026489245  3.02557680193897  -0.163155675365072  0.870395864425051                NA

Read about authors' note on p-values and adjusted p-values set to NA here

Exploring results

MA-plot

We use MA-plot to show the log2 fold changes attributable to a given variable (i.e. pre-lesion) over the mean of normalized counts for all the samples. Points will be colored red if the adjusted p value is less than 0.05. Points which fall out of the window are plotted as open triangles pointing either up or down

resLFC = lfcShrink(dds, coef = "paris_classification_0.IIa_vs_normal", 
type="apeglm")

png("DTE_MA-plot.Salmon.png", width=7, height=5, units = "in", res = 300)
plotMA(resLFC, alpha = 0.05, ylim=c(-6,6), 
main = "MA-plot for the shrunken log2 fold changes")
dev.off()

Principal component plot of the samples

First, we perform count data transformation with regularized logarithm rlog or variance stabilizing transformations vst. You can read here about which transformation to choose

rld = rlog(dds)
vsd = vst(dds)

Perform sample PCA for transformed data using plotPCA, then plot with ggplot

# rlog
pcaData = plotPCA(rld, intgroup=c("individual","paris_classification"), 
returnData=TRUE)
percentVar = round(100 * attr(pcaData, "percentVar"))

png("DTE_PCA-rlog.Salmon.png", width=7, height=7, units = "in", res = 300)
ggplot(pcaData, aes(PC1, PC2, colour = paris_classification)) + 
geom_point(size = 2) + theme_bw() + 
scale_color_manual(values = c("blue", "red")) +
geom_text_repel(aes(label = individual), nudge_x = -1, nudge_y = 0.2, size = 3) +
ggtitle("Principal Component Analysis (PCA)", subtitle = "rlog transformation") +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance"))
dev.off()
# vst
pcaData = plotPCA(vsd, intgroup=c("individual","paris_classification"), 
returnData=TRUE)
percentVar = round(100 * attr(pcaData, "percentVar"))

png("DTE_PCA-vst.Salmon.png", width=7, height=7, units = "in", res = 300)
ggplot(pcaData, aes(PC1, PC2, colour = paris_classification)) + 
geom_point(size = 2) +theme_bw() + scale_color_manual(values = c("blue", "red")) +
geom_text_repel(aes(label = individual), nudge_x = -1, nudge_y = 0.2, size = 3) +
ggtitle("Principal Component Analysis (PCA)", subtitle = "vst transformation") +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance"))
dev.off()

Volcano plots

We use EnhancedVolcano to generate volcano plot to visualise the results of differential expression analyses

pCutoff = 0.05
FCcutoff = 1.0

p = EnhancedVolcano(data.frame(res), lab = NA, x = 'log2FoldChange', y = 'padj',
xlab = bquote(~Log[2]~ 'fold change'), ylab = bquote(~-Log[10]~adjusted~italic(P)),
pCutoff = pCutoff, FCcutoff = FCcutoff, pointSize = 1.0, labSize = 2.0,
title = "Volcano plot", subtitle = "SSA/P vs. Normal",
caption = paste0('log2 FC cutoff: ', FCcutoff, '; p-value cutoff: ', pCutoff, '\nTotal = ', nrow(res), ' variables'),
legend=c('NS','Log2 FC','Adjusted p-value', 'Adjusted p-value & Log2 FC'),
legendPosition = 'bottom', legendLabSize = 14, legendIconSize = 5.0)

png("DTE_VolcanoPlots.Salmon.png", width=7, height=7, units = "in", res = 300)
print(p)
dev.off()

Exporting results

We construct a normData data.frame to store per-group normalised mean and normalised counts of all samples, and a deData data.frame to store the DESeq results. We merge both data.frame with a annoData data.frame that contain per-transcript annotation derived from GENCODE GTF file before exporting the results in tabular format

annoData = "/home/USER/db/refanno/gencode.v33.annotation_transcripts.txt"
annoData = data.frame(fread(annoData))

normCounts = as.data.frame(counts(dds, normalized = TRUE))
baseMeans = as.data.frame(sapply( levels(dds$paris_classification), 
function(lvl) rowMeans( counts(dds, normalized = TRUE)[, dds$paris_classification == lvl, drop = FALSE] ) ))

normData = merge(annoData, merge(baseMeans, normCounts, by.x = 'row.names', by.y = 'row.names'), by.x = 'TranscriptID', by.y = 'Row.names')
normData = normData[order(normData$Chromosome, normData$Start, normData$End),]

deData = data.frame(res[,c(1,2,5,6)])
colnames(deData) = c("baseMean","log2fc","pvalue","padj")
deData = merge(annoData, deData, by.x = 'TranscriptID', by.y = 'row.names')
deData = deData[order(deData$Chromosome, deData$Start, deData$End),]

write.table(normData, file="DTE_DESeq2_Means_and_NormalisedCount.Salmon.txt", 
sep = "\t", quote = F, row.names = F, col.names = T)
write.table(deData, file="DTE_DESeq2_DE_results.Salmon.txt", 
sep = "\t", quote = F, row.names = F, col.names = T)
> normData[normData$GeneSymbol == "ZIC2", 1:14]
           TranscriptID             GeneID GeneSymbol Chromosome    Start      End TranscriptName                Class Strand Length     normal       0-IIa ERR2675454 ERR2675455
25392 ENST00000376335.8 ENSG00000043355.12       ZIC2      chr13 99981784 99986765       ZIC2-201       protein_coding      +   4981 2.28107673 220.7244957 379.175969          0
93049 ENST00000490085.5 ENSG00000043355.12       ZIC2      chr13 99984289 99985492       ZIC2-205 processed_transcript      +   1203 0.00000000   0.1029024   1.029024          0
75590 ENST00000468291.1 ENSG00000043355.12       ZIC2      chr13 99984689 99985492       ZIC2-202 processed_transcript      +    803 0.39991361   0.0000000   0.000000          0
82745 ENST00000477213.1 ENSG00000043355.12       ZIC2      chr13 99984789 99985474       ZIC2-203 processed_transcript      +    685 0.09894979   1.2127449   0.000000          0
86234 ENST00000481565.1 ENSG00000043355.12       ZIC2      chr13 99985494 99985904       ZIC2-204 processed_transcript      +    410 0.00000000   0.0000000   0.000000          0

> deData[deData$GeneSymbol == "ZIC2",]
           TranscriptID             GeneID GeneSymbol Chromosome    Start      End TranscriptName                Class Strand Length     baseMean      log2fc       pvalue         padj
25392 ENST00000376335.8 ENSG00000043355.12       ZIC2      chr13 99981784 99986765       ZIC2-201       protein_coding      +   4981 111.50278623  6.79764818 1.242670e-33 1.534719e-30
93049 ENST00000490085.5 ENSG00000043355.12       ZIC2      chr13 99984289 99985492       ZIC2-205 processed_transcript      +   1203   0.05145118  0.08858668 9.766466e-01           NA
75590 ENST00000468291.1 ENSG00000043355.12       ZIC2      chr13 99984689 99985492       ZIC2-202 processed_transcript      +    803   0.19995681 -0.62032672 8.371503e-01           NA
82745 ENST00000477213.1 ENSG00000043355.12       ZIC2      chr13 99984789 99985474       ZIC2-203 processed_transcript      +    685   0.65584733  1.65567410 5.805272e-01           NA
86234 ENST00000481565.1 ENSG00000043355.12       ZIC2      chr13 99985494 99985904       ZIC2-204 processed_transcript      +    410   0.00000000          NA           NA           NA

Session info

> sessionInfo()
R version 3.6.2 (2019-12-12)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 18.04.4 LTS

Matrix products: default
BLAS/LAPACK: /home/USER/miniconda3/lib/libopenblasp-r0.3.9.so

locale:
 [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
 [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
 [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] EnhancedVolcano_1.4.0       ggrepel_0.8.2              
 [3] ggplot2_3.3.0               apeglm_1.8.0               
 [5] DESeq2_1.26.0               SummarizedExperiment_1.16.1
 [7] DelayedArray_0.12.3         BiocParallel_1.20.1        
 [9] matrixStats_0.56.0          Biobase_2.46.0             
[11] GenomicRanges_1.38.0        GenomeInfoDb_1.22.1        
[13] IRanges_2.20.2              S4Vectors_0.24.4           
[15] BiocGenerics_0.32.0         tximport_1.14.2            
[17] data.table_1.12.8          

loaded via a namespace (and not attached):
 [1] bitops_1.0-6           bit64_0.9-7            RColorBrewer_1.1-2    
 [4] numDeriv_2016.8-1.1    tools_3.6.2            backports_1.1.6       
 [7] R6_2.4.1               rpart_4.1-15           Hmisc_4.4-0           
[10] DBI_1.1.0              colorspace_1.4-1       nnet_7.3-14           
[13] withr_2.2.0            tidyselect_1.0.0       gridExtra_2.3         
[16] bit_1.1-15.2           compiler_3.6.2         htmlTable_1.13.3      
[19] labeling_0.3           scales_1.1.0           checkmate_2.0.0       
[22] mvtnorm_1.1-0          readr_1.3.1            genefilter_1.68.0     
[25] stringr_1.4.0          digest_0.6.25          foreign_0.8-76        
[28] XVector_0.26.0         base64enc_0.1-3        jpeg_0.1-8.1          
[31] pkgconfig_2.0.3        htmltools_0.4.0        bbmle_1.0.23.1        
[34] htmlwidgets_1.5.1      rlang_0.4.6            rstudioapi_0.11       
[37] RSQLite_2.2.0          farver_2.0.3           jsonlite_1.6.1        
[40] acepack_1.4.1          dplyr_0.8.5            RCurl_1.98-1.2        
[43] magrittr_1.5           GenomeInfoDbData_1.2.2 Formula_1.2-3         
[46] Matrix_1.2-18          Rcpp_1.0.4.6           munsell_0.5.0         
[49] lifecycle_0.2.0        stringi_1.4.6          MASS_7.3-51.6         
[52] zlibbioc_1.32.0        plyr_1.8.6             grid_3.6.2            
[55] blob_1.2.1             bdsmatrix_1.3-4        crayon_1.3.4          
[58] lattice_0.20-41        splines_3.6.2          annotate_1.64.0       
[61] hms_0.5.3              locfit_1.5-9.4         knitr_1.28            
[64] pillar_1.4.4           geneplotter_1.64.0     XML_3.99-0.3          
[67] glue_1.4.0             latticeExtra_0.6-29    png_0.1-7             
[70] vctrs_0.2.4            gtable_0.3.0           purrr_0.3.4           
[73] assertthat_0.2.1       emdbook_1.3.12         xfun_0.13             
[76] xtable_1.8-4           coda_0.19-3            survival_3.1-12       
[79] tibble_3.0.1           AnnotationDbi_1.48.0   memoise_1.1.0         
[82] cluster_2.1.0          ellipsis_0.3.0        

Last updated