DTE analysis with Salmon/Kallisto input
Analysis and result presented was performed with Salmon counts, Code snippet to import Kallisto counts is also provided
Load required libraries
library(data.table)
library(tximport)
library(DESeq2)
library(apeglm)
library(ggplot2)
library(ggrepel)
library(EnhancedVolcano)
Import clinical data
Load clinical information and define file path
dir = "../../"
sampleData = paste0(dir, "clinical.txt")
sampleData = fread(sampleData)
rownames(sampleData) = sampleData$ENA_RUN
sampleData$individual = as.factor(sampleData$individual)
sampleData$paris_classification = as.factor(sampleData$paris_classification)
# Use relevel() to set adjacent normal samples as reference
sampleData$paris_classification = relevel(sampleData$paris_classification, ref = "normal")
Load expression data
Salmon
files = file.path(paste0(dir, "salmon"), list.files(paste0(dir, "salmon")), "quant.sf")
names(files) = list.files(paste0(dir, "salmon"))
> files
ERR2675454 ERR2675455
"../../salmon/ERR2675454/quant.sf" "../../salmon/ERR2675455/quant.sf"
ERR2675458 ERR2675459
"../../salmon/ERR2675458/quant.sf" "../../salmon/ERR2675459/quant.sf"
ERR2675460 ERR2675461
"../../salmon/ERR2675460/quant.sf" "../../salmon/ERR2675461/quant.sf"
ERR2675464 ERR2675465
"../../salmon/ERR2675464/quant.sf" "../../salmon/ERR2675465/quant.sf"
ERR2675468 ERR2675469
"../../salmon/ERR2675468/quant.sf" "../../salmon/ERR2675469/quant.sf"
ERR2675472 ERR2675473
"../../salmon/ERR2675472/quant.sf" "../../salmon/ERR2675473/quant.sf"
ERR2675476 ERR2675477
"../../salmon/ERR2675476/quant.sf" "../../salmon/ERR2675477/quant.sf"
ERR2675478 ERR2675479
"../../salmon/ERR2675478/quant.sf" "../../salmon/ERR2675479/quant.sf"
ERR2675480 ERR2675481
"../../salmon/ERR2675480/quant.sf" "../../salmon/ERR2675481/quant.sf"
ERR2675484 ERR2675485
"../../salmon/ERR2675484/quant.sf" "../../salmon/ERR2675485/quant.sf"
Import Salmon NumReads
values with tximport
which contains the transcript-level counts (txIn = TRUE
), abundances and average transcript lengths, then output transcript-level estimates (txOut = TRUE
)
txi = tximport(files, type = "salmon",
txIn = TRUE, txOut = TRUE, countsFromAbundance = "no")
> class(txi)
[1] "list"
> names(txi)
[1] "abundance" "counts"
[3] "length" "countsFromAbundance"
Kallisto
files = file.path(paste0(dir, "kallisto"), list.files(paste0(dir, "kallisto")), "abundance.h5")
names(files) = list.files(paste0(dir, "kallisto"))
> files
ERR2675454 ERR2675455
"../../kallisto/ERR2675454/abundance.h5" "../../kallisto/ERR2675455/abundance.h5"
ERR2675458 ERR2675459
"../../kallisto/ERR2675458/abundance.h5" "../../kallisto/ERR2675459/abundance.h5"
ERR2675460 ERR2675461
"../../kallisto/ERR2675460/abundance.h5" "../../kallisto/ERR2675461/abundance.h5"
ERR2675464 ERR2675465
"../../kallisto/ERR2675464/abundance.h5" "../../kallisto/ERR2675465/abundance.h5"
ERR2675468 ERR2675469
"../../kallisto/ERR2675468/abundance.h5" "../../kallisto/ERR2675469/abundance.h5"
ERR2675472 ERR2675473
"../../kallisto/ERR2675472/abundance.h5" "../../kallisto/ERR2675473/abundance.h5"
ERR2675476 ERR2675477
"../../kallisto/ERR2675476/abundance.h5" "../../kallisto/ERR2675477/abundance.h5"
ERR2675478 ERR2675479
"../../kallisto/ERR2675478/abundance.h5" "../../kallisto/ERR2675479/abundance.h5"
ERR2675480 ERR2675481
"../../kallisto/ERR2675480/abundance.h5" "../../kallisto/ERR2675481/abundance.h5"
ERR2675484 ERR2675485
"../../kallisto/ERR2675484/abundance.h5" "../../kallisto/ERR2675485/abundance.h5"
Import Kallisto abundance.h5
files with tximport
txi = tximport(files, type = "kallisto",
txIn = TRUE, txOut = TRUE, countsFromAbundance = "no")
> class(txi)
[1] "list"
> names(txi)
[1] "abundance" "counts"
[3] "infReps" "length"
[5] "countsFromAbundance"
Create DESeqDataSet object
dds = DESeqDataSetFromTximport(txi,
colData = sampleData, ~ individual + paris_classification)
> dds
class: DESeqDataSet
dim: 227063 20
metadata(1): version
assays(2): counts avgTxLength
rownames(227063): ENST00000456328.2 ENST00000450305.2 ... ENST00000387460.2
ENST00000387461.2
rowData names(0):
colnames(20): ERR2675454 ERR2675455 ... ERR2675484 ERR2675485
colData names(11): ENA_RUN individual ... braf_status kras_status
Differential expression analysis
Run DESeq2 analysis using DESeq
, which performs (1) estimation of size factors, (2) estimation of dispersion, then (3) Negative Binomial GLM fitting and Wald statistics. The results tables (log2 fold changes and p-values) can be generated using the results
function
dds = DESeq(dds)
> cbind(resultsNames(dds))
[,1]
[1,] "Intercept"
[2,] "individual_S11_vs_S1"
[3,] "individual_S12_vs_S1"
[4,] "individual_S15_vs_S1"
[5,] "individual_S17_vs_S1"
[6,] "individual_S3_vs_S1"
[7,] "individual_S5_vs_S1"
[8,] "individual_S6_vs_S1"
[9,] "individual_S7_vs_S1"
[10,] "individual_S9_vs_S1"
[11,] "paris_classification_0.IIa_vs_normal"
We set the adjusted p-value cutoff (FDR) to be 0.05, hence we change the default significance cutoff used for optimizing the independent filtering alpha
from 0.1 to 0.05
res <- results(dds, name = "paris_classification_0.IIa_vs_normal", alpha = 0.05)
At a FDR of 0.05, we have 3274 genes up-regulated and 3342 genes down-regulated in 0-IIa pre-lesion versus Normal
> summary(res)
out of 159033 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up) : 5020, 3.2%
LFC < 0 (down) : 3728, 2.3%
outliers [1] : 0, 0%
low counts [2] : 87402, 55%
(mean count < 3)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
The results res
object contains the follow columns
> mcols(res)$description
[1] "mean of normalized counts for all samples"
[2] "log2 fold change (MLE): paris classification 0.IIa vs normal"
[3] "standard error: paris classification 0.IIa vs normal"
[4] "Wald statistic: paris classification 0.IIa vs normal"
[5] "Wald test p-value: paris classification 0.IIa vs normal"
[6] "BH adjusted p-values"
> head(res)
log2 fold change (MLE): paris classification 0.IIa vs normal
Wald test p-value: paris classification 0.IIa vs normal
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
ENST00000456328.2 0.399730016527216 -0.199743190436256 3.01346565486448 -0.0662835463592627 0.947152082517689 NA
ENST00000450305.2 0 NA NA NA NA NA
ENST00000488147.1 212.684877084654 -0.339571171837121 0.174521709884518 -1.94572452941137 0.0516878384949591 0.229396007387386
ENST00000619216.1 0 NA NA NA NA NA
ENST00000473358.1 0 NA NA NA NA NA
ENST00000469289.1 0.0480413177220545 -0.493640026489245 3.02557680193897 -0.163155675365072 0.870395864425051 NA
Exploring results
MA-plot
We use MA-plot to show the log2 fold changes attributable to a given variable (i.e. pre-lesion) over the mean of normalized counts for all the samples. Points will be colored red if the adjusted p value is less than 0.05. Points which fall out of the window are plotted as open triangles pointing either up or down
resLFC = lfcShrink(dds, coef = "paris_classification_0.IIa_vs_normal",
type="apeglm")
png("DTE_MA-plot.Salmon.png", width=7, height=5, units = "in", res = 300)
plotMA(resLFC, alpha = 0.05, ylim=c(-6,6),
main = "MA-plot for the shrunken log2 fold changes")
dev.off()
Principal component plot of the samples
rld = rlog(dds)
vsd = vst(dds)
Perform sample PCA for transformed data using plotPCA
, then plot with ggplot
# rlog
pcaData = plotPCA(rld, intgroup=c("individual","paris_classification"),
returnData=TRUE)
percentVar = round(100 * attr(pcaData, "percentVar"))
png("DTE_PCA-rlog.Salmon.png", width=7, height=7, units = "in", res = 300)
ggplot(pcaData, aes(PC1, PC2, colour = paris_classification)) +
geom_point(size = 2) + theme_bw() +
scale_color_manual(values = c("blue", "red")) +
geom_text_repel(aes(label = individual), nudge_x = -1, nudge_y = 0.2, size = 3) +
ggtitle("Principal Component Analysis (PCA)", subtitle = "rlog transformation") +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance"))
dev.off()
# vst
pcaData = plotPCA(vsd, intgroup=c("individual","paris_classification"),
returnData=TRUE)
percentVar = round(100 * attr(pcaData, "percentVar"))
png("DTE_PCA-vst.Salmon.png", width=7, height=7, units = "in", res = 300)
ggplot(pcaData, aes(PC1, PC2, colour = paris_classification)) +
geom_point(size = 2) +theme_bw() + scale_color_manual(values = c("blue", "red")) +
geom_text_repel(aes(label = individual), nudge_x = -1, nudge_y = 0.2, size = 3) +
ggtitle("Principal Component Analysis (PCA)", subtitle = "vst transformation") +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance"))
dev.off()
Volcano plots
We use EnhancedVolcano
to generate volcano plot to visualise the results of differential expression analyses
pCutoff = 0.05
FCcutoff = 1.0
p = EnhancedVolcano(data.frame(res), lab = NA, x = 'log2FoldChange', y = 'padj',
xlab = bquote(~Log[2]~ 'fold change'), ylab = bquote(~-Log[10]~adjusted~italic(P)),
pCutoff = pCutoff, FCcutoff = FCcutoff, pointSize = 1.0, labSize = 2.0,
title = "Volcano plot", subtitle = "SSA/P vs. Normal",
caption = paste0('log2 FC cutoff: ', FCcutoff, '; p-value cutoff: ', pCutoff, '\nTotal = ', nrow(res), ' variables'),
legend=c('NS','Log2 FC','Adjusted p-value', 'Adjusted p-value & Log2 FC'),
legendPosition = 'bottom', legendLabSize = 14, legendIconSize = 5.0)
png("DTE_VolcanoPlots.Salmon.png", width=7, height=7, units = "in", res = 300)
print(p)
dev.off()
Exporting results
We construct a normData
data.frame to store per-group normalised mean and normalised counts of all samples, and a deData
data.frame to store the DESeq results. We merge both data.frame with a annoData
data.frame that contain per-transcript annotation derived from GENCODE GTF file before exporting the results in tabular format
annoData = "/home/USER/db/refanno/gencode.v33.annotation_transcripts.txt"
annoData = data.frame(fread(annoData))
normCounts = as.data.frame(counts(dds, normalized = TRUE))
baseMeans = as.data.frame(sapply( levels(dds$paris_classification),
function(lvl) rowMeans( counts(dds, normalized = TRUE)[, dds$paris_classification == lvl, drop = FALSE] ) ))
normData = merge(annoData, merge(baseMeans, normCounts, by.x = 'row.names', by.y = 'row.names'), by.x = 'TranscriptID', by.y = 'Row.names')
normData = normData[order(normData$Chromosome, normData$Start, normData$End),]
deData = data.frame(res[,c(1,2,5,6)])
colnames(deData) = c("baseMean","log2fc","pvalue","padj")
deData = merge(annoData, deData, by.x = 'TranscriptID', by.y = 'row.names')
deData = deData[order(deData$Chromosome, deData$Start, deData$End),]
write.table(normData, file="DTE_DESeq2_Means_and_NormalisedCount.Salmon.txt",
sep = "\t", quote = F, row.names = F, col.names = T)
write.table(deData, file="DTE_DESeq2_DE_results.Salmon.txt",
sep = "\t", quote = F, row.names = F, col.names = T)
> normData[normData$GeneSymbol == "ZIC2", 1:14]
TranscriptID GeneID GeneSymbol Chromosome Start End TranscriptName Class Strand Length normal 0-IIa ERR2675454 ERR2675455
25392 ENST00000376335.8 ENSG00000043355.12 ZIC2 chr13 99981784 99986765 ZIC2-201 protein_coding + 4981 2.28107673 220.7244957 379.175969 0
93049 ENST00000490085.5 ENSG00000043355.12 ZIC2 chr13 99984289 99985492 ZIC2-205 processed_transcript + 1203 0.00000000 0.1029024 1.029024 0
75590 ENST00000468291.1 ENSG00000043355.12 ZIC2 chr13 99984689 99985492 ZIC2-202 processed_transcript + 803 0.39991361 0.0000000 0.000000 0
82745 ENST00000477213.1 ENSG00000043355.12 ZIC2 chr13 99984789 99985474 ZIC2-203 processed_transcript + 685 0.09894979 1.2127449 0.000000 0
86234 ENST00000481565.1 ENSG00000043355.12 ZIC2 chr13 99985494 99985904 ZIC2-204 processed_transcript + 410 0.00000000 0.0000000 0.000000 0
> deData[deData$GeneSymbol == "ZIC2",]
TranscriptID GeneID GeneSymbol Chromosome Start End TranscriptName Class Strand Length baseMean log2fc pvalue padj
25392 ENST00000376335.8 ENSG00000043355.12 ZIC2 chr13 99981784 99986765 ZIC2-201 protein_coding + 4981 111.50278623 6.79764818 1.242670e-33 1.534719e-30
93049 ENST00000490085.5 ENSG00000043355.12 ZIC2 chr13 99984289 99985492 ZIC2-205 processed_transcript + 1203 0.05145118 0.08858668 9.766466e-01 NA
75590 ENST00000468291.1 ENSG00000043355.12 ZIC2 chr13 99984689 99985492 ZIC2-202 processed_transcript + 803 0.19995681 -0.62032672 8.371503e-01 NA
82745 ENST00000477213.1 ENSG00000043355.12 ZIC2 chr13 99984789 99985474 ZIC2-203 processed_transcript + 685 0.65584733 1.65567410 5.805272e-01 NA
86234 ENST00000481565.1 ENSG00000043355.12 ZIC2 chr13 99985494 99985904 ZIC2-204 processed_transcript + 410 0.00000000 NA NA NA
Session info
> sessionInfo()
R version 3.6.2 (2019-12-12)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 18.04.4 LTS
Matrix products: default
BLAS/LAPACK: /home/USER/miniconda3/lib/libopenblasp-r0.3.9.so
locale:
[1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
[5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
[7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel stats4 stats graphics grDevices utils datasets
[8] methods base
other attached packages:
[1] EnhancedVolcano_1.4.0 ggrepel_0.8.2
[3] ggplot2_3.3.0 apeglm_1.8.0
[5] DESeq2_1.26.0 SummarizedExperiment_1.16.1
[7] DelayedArray_0.12.3 BiocParallel_1.20.1
[9] matrixStats_0.56.0 Biobase_2.46.0
[11] GenomicRanges_1.38.0 GenomeInfoDb_1.22.1
[13] IRanges_2.20.2 S4Vectors_0.24.4
[15] BiocGenerics_0.32.0 tximport_1.14.2
[17] data.table_1.12.8
loaded via a namespace (and not attached):
[1] bitops_1.0-6 bit64_0.9-7 RColorBrewer_1.1-2
[4] numDeriv_2016.8-1.1 tools_3.6.2 backports_1.1.6
[7] R6_2.4.1 rpart_4.1-15 Hmisc_4.4-0
[10] DBI_1.1.0 colorspace_1.4-1 nnet_7.3-14
[13] withr_2.2.0 tidyselect_1.0.0 gridExtra_2.3
[16] bit_1.1-15.2 compiler_3.6.2 htmlTable_1.13.3
[19] labeling_0.3 scales_1.1.0 checkmate_2.0.0
[22] mvtnorm_1.1-0 readr_1.3.1 genefilter_1.68.0
[25] stringr_1.4.0 digest_0.6.25 foreign_0.8-76
[28] XVector_0.26.0 base64enc_0.1-3 jpeg_0.1-8.1
[31] pkgconfig_2.0.3 htmltools_0.4.0 bbmle_1.0.23.1
[34] htmlwidgets_1.5.1 rlang_0.4.6 rstudioapi_0.11
[37] RSQLite_2.2.0 farver_2.0.3 jsonlite_1.6.1
[40] acepack_1.4.1 dplyr_0.8.5 RCurl_1.98-1.2
[43] magrittr_1.5 GenomeInfoDbData_1.2.2 Formula_1.2-3
[46] Matrix_1.2-18 Rcpp_1.0.4.6 munsell_0.5.0
[49] lifecycle_0.2.0 stringi_1.4.6 MASS_7.3-51.6
[52] zlibbioc_1.32.0 plyr_1.8.6 grid_3.6.2
[55] blob_1.2.1 bdsmatrix_1.3-4 crayon_1.3.4
[58] lattice_0.20-41 splines_3.6.2 annotate_1.64.0
[61] hms_0.5.3 locfit_1.5-9.4 knitr_1.28
[64] pillar_1.4.4 geneplotter_1.64.0 XML_3.99-0.3
[67] glue_1.4.0 latticeExtra_0.6-29 png_0.1-7
[70] vctrs_0.2.4 gtable_0.3.0 purrr_0.3.4
[73] assertthat_0.2.1 emdbook_1.3.12 xfun_0.13
[76] xtable_1.8-4 coda_0.19-3 survival_3.1-12
[79] tibble_3.0.1 AnnotationDbi_1.48.0 memoise_1.1.0
[82] cluster_2.1.0 ellipsis_0.3.0
Last updated
Was this helpful?