DGE analysis with Salmon/Kallisto input
Analysis and result presented was performed with Salmon counts, Code snippet to import Kallisto counts is also provided
Load required libraries
library(data.table)
library(GenomicFeatures)
library(tximport)
library(DESeq2)
library(apeglm)
library(ggplot2)
library(ggrepel)
library(EnhancedVolcano)
Import clinical data
Load clinical information and define file path
dir = "../../"
sampleData = paste0(dir, "clinical.txt")
sampleData = fread(sampleData)
rownames(sampleData) = sampleData$ENA_RUN
sampleData$individual = as.factor(sampleData$individual)
sampleData$paris_classification = as.factor(sampleData$paris_classification)
# Use relevel() to set adjacent normal samples as reference
sampleData$paris_classification = relevel(sampleData$paris_classification, ref = "normal")
files = file.path(paste0(dir, "salmon"), list.files(paste0(dir, "salmon")), "quant.sf")
names(files) = list.files(paste0(dir, "salmon"))
Load TxDB
Load TxDB and construct a two column data.frame tx2gene
with the transcript and gene identifiers
txdb.filename = "/home/USER/db/refanno/gencode.v33.annotation.sqlite"
txdb = loadDb(txdb.filename)
k = keys(txdb, keytype = "TXNAME")
tx2gene = select(txdb, k, "GENEID", "TXNAME")
> head(tx2gene)
TXNAME GENEID
1 ENST00000456328.2 ENSG00000223972.5
2 ENST00000450305.2 ENSG00000223972.5
3 ENST00000473358.1 ENSG00000243485.5
4 ENST00000469289.1 ENSG00000243485.5
5 ENST00000607096.1 ENSG00000284332.1
6 ENST00000606857.1 ENSG00000268020.3
Load expression data
Salmon
files = file.path(paste0(dir, "salmon"), list.files(paste0(dir, "salmon")), "quant.sf")
names(files) = list.files(paste0(dir, "salmon"))
> files
ERR2675454 ERR2675455
"../../salmon/ERR2675454/quant.sf" "../../salmon/ERR2675455/quant.sf"
ERR2675458 ERR2675459
"../../salmon/ERR2675458/quant.sf" "../../salmon/ERR2675459/quant.sf"
ERR2675460 ERR2675461
"../../salmon/ERR2675460/quant.sf" "../../salmon/ERR2675461/quant.sf"
ERR2675464 ERR2675465
"../../salmon/ERR2675464/quant.sf" "../../salmon/ERR2675465/quant.sf"
ERR2675468 ERR2675469
"../../salmon/ERR2675468/quant.sf" "../../salmon/ERR2675469/quant.sf"
ERR2675472 ERR2675473
"../../salmon/ERR2675472/quant.sf" "../../salmon/ERR2675473/quant.sf"
ERR2675476 ERR2675477
"../../salmon/ERR2675476/quant.sf" "../../salmon/ERR2675477/quant.sf"
ERR2675478 ERR2675479
"../../salmon/ERR2675478/quant.sf" "../../salmon/ERR2675479/quant.sf"
ERR2675480 ERR2675481
"../../salmon/ERR2675480/quant.sf" "../../salmon/ERR2675481/quant.sf"
ERR2675484 ERR2675485
"../../salmon/ERR2675484/quant.sf" "../../salmon/ERR2675485/quant.sf"
Import Salmon NumReads
values with tximport
which contains the transcript-level counts (txIn = TRUE
), abundances and average transcript lengths, then output gene-level summarization (txOut = FALSE
)
txi = tximport(files, type = "salmon", tx2gene = tx2gene,
txIn = TRUE, txOut = FALSE, countsFromAbundance = "no")
> class(txi)
[1] "list"
> names(txi)
[1] "abundance" "counts"
[3] "length" "countsFromAbundance"
Kallisto
files = file.path(paste0(dir, "kallisto"), list.files(paste0(dir, "kallisto")), "abundance.h5")
names(files) = list.files(paste0(dir, "kallisto"))
> files
ERR2675454 ERR2675455
"../../kallisto/ERR2675454/abundance.h5" "../../kallisto/ERR2675455/abundance.h5"
ERR2675458 ERR2675459
"../../kallisto/ERR2675458/abundance.h5" "../../kallisto/ERR2675459/abundance.h5"
ERR2675460 ERR2675461
"../../kallisto/ERR2675460/abundance.h5" "../../kallisto/ERR2675461/abundance.h5"
ERR2675464 ERR2675465
"../../kallisto/ERR2675464/abundance.h5" "../../kallisto/ERR2675465/abundance.h5"
ERR2675468 ERR2675469
"../../kallisto/ERR2675468/abundance.h5" "../../kallisto/ERR2675469/abundance.h5"
ERR2675472 ERR2675473
"../../kallisto/ERR2675472/abundance.h5" "../../kallisto/ERR2675473/abundance.h5"
ERR2675476 ERR2675477
"../../kallisto/ERR2675476/abundance.h5" "../../kallisto/ERR2675477/abundance.h5"
ERR2675478 ERR2675479
"../../kallisto/ERR2675478/abundance.h5" "../../kallisto/ERR2675479/abundance.h5"
ERR2675480 ERR2675481
"../../kallisto/ERR2675480/abundance.h5" "../../kallisto/ERR2675481/abundance.h5"
ERR2675484 ERR2675485
"../../kallisto/ERR2675484/abundance.h5" "../../kallisto/ERR2675485/abundance.h5"
Import Kallisto abundance.h5
files with tximport
txi = tximport(files, type = "kallisto", tx2gene = tx2gene,
txIn = TRUE, txOut = FALSE, countsFromAbundance = "no")
> class(txi)
[1] "list"
> names(txi)
[1] "abundance" "counts"
[3] "infReps" "length"
[5] "countsFromAbundance"
Create DESeqDataSet object
dds = DESeqDataSetFromTximport(txi,
colData = sampleData, ~ individual + paris_classification)
> dds
class: DESeqDataSet
dim: 60233 20
metadata(1): version
assays(2): counts avgTxLength
rownames(60233): ENSG00000000003.15 ENSG00000000005.6 ... ENSG00000288587.1 ENSG00000288588.1
rowData names(0):
colnames(20): ERR2675454 ERR2675455 ... ERR2675484 ERR2675485
colData names(11): ENA_RUN individual ... braf_status kras_status
Differential expression analysis
Run DESeq2 analysis using DESeq
, which performs (1) estimation of size factors, (2) estimation of dispersion, then (3) Negative Binomial GLM fitting and Wald statistics. The results tables (log2 fold changes and p-values) can be generated using the results
function
dds = DESeq(dds)
> cbind(resultsNames(dds))
[,1]
[1,] "Intercept"
[2,] "individual_S11_vs_S1"
[3,] "individual_S12_vs_S1"
[4,] "individual_S15_vs_S1"
[5,] "individual_S17_vs_S1"
[6,] "individual_S3_vs_S1"
[7,] "individual_S5_vs_S1"
[8,] "individual_S6_vs_S1"
[9,] "individual_S7_vs_S1"
[10,] "individual_S9_vs_S1"
[11,] "paris_classification_0.IIa_vs_normal"
We set the adjusted p-value cutoff (FDR) to be 0.05, hence we change the default significance cutoff used for optimizing the independent filtering alpha
from 0.1 to 0.05
res <- results(dds, name = "paris_classification_0.IIa_vs_normal", alpha = 0.05)
At a FDR of 0.05, we have 3274 genes up-regulated and 3342 genes down-regulated in 0-IIa pre-lesion versus Normal
> summary(res)
out of 37313 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up) : 3274, 8.8%
LFC < 0 (down) : 3342, 9%
outliers [1] : 0, 0%
low counts [2] : 15400, 41%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
The results res
object contains the follow columns
> mcols(res)$description
[1] "mean of normalized counts for all samples"
[2] "log2 fold change (MLE): paris classification 0.IIa vs normal"
[3] "standard error: paris classification 0.IIa vs normal"
[4] "Wald statistic: paris classification 0.IIa vs normal"
[5] "Wald test p-value: paris classification 0.IIa vs normal"
[6] "BH adjusted p-values"
> head(res)
log2 fold change (MLE): paris classification 0.IIa vs normal
Wald test p-value: paris classification 0.IIa vs normal
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
ENSG00000000003.15 1546.35486175611 -0.581311619578096 0.174441213818185 -3.33242131749887 0.000860938112156993 0.00453612331129988
ENSG00000000005.6 33.9170595345844 -0.127395801668489 0.36047271289215 -0.353413163083456 0.723778719010441 0.832554491846499
ENSG00000000419.12 810.269365079278 0.0330353444973176 0.164082074011246 0.201334269428197 0.840437208028287 0.909277206454224
ENSG00000000457.14 433.817917089428 -0.206547272035916 0.12218101991675 -1.69050211052953 0.0909319348233635 0.201842735796633
ENSG00000000460.17 134.776100984046 0.154247180040075 0.221278370791785 0.697073010290807 0.485757103825257 0.65367203488841
ENSG00000000938.13 53.6431744642057 0.0891342530564219 0.288015018855273 0.309477795327096 0.756958100440997 0.855339071823214
Exploring results
MA-plot
We use MA-plot to show the log2 fold changes attributable to a given variable (i.e. pre-lesion) over the mean of normalized counts for all the samples. Points will be colored red if the adjusted p value is less than 0.05. Points which fall out of the window are plotted as open triangles pointing either up or down
resLFC = lfcShrink(dds, coef = "paris_classification_0.IIa_vs_normal",
type="apeglm")
png("DGE_MA-plot.Salmon.png", width=7, height=5, units = "in", res = 300)
plotMA(resLFC, alpha = 0.05, ylim=c(-6,6),
main = "MA-plot for the shrunken log2 fold changes")
dev.off()
Principal component plot of the samples
rld = rlog(dds)
vsd = vst(dds)
Perform sample PCA for transformed data using plotPCA
, then plot with ggplot
# rlog
pcaData = plotPCA(rld, intgroup=c("individual","paris_classification"),
returnData=TRUE)
percentVar = round(100 * attr(pcaData, "percentVar"))
png("DGE_PCA-rlog.Salmon.png", width=7, height=7, units = "in", res = 300)
ggplot(pcaData, aes(PC1, PC2, colour = paris_classification)) +
geom_point(size = 2) + theme_bw() +
scale_color_manual(values = c("blue", "red")) +
geom_text_repel(aes(label = individual), nudge_x = -1, nudge_y = 0.2, size = 3) +
ggtitle("Principal Component Analysis (PCA)", subtitle = "rlog transformation") +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance"))
dev.off()
# vst
pcaData = plotPCA(vsd, intgroup=c("individual","paris_classification"),
returnData=TRUE)
percentVar = round(100 * attr(pcaData, "percentVar"))
png("DGE_PCA-vst.Salmon.png", width=7, height=7, units = "in", res = 300)
ggplot(pcaData, aes(PC1, PC2, colour = paris_classification)) +
geom_point(size = 2) +theme_bw() + scale_color_manual(values = c("blue", "red")) +
geom_text_repel(aes(label = individual), nudge_x = -1, nudge_y = 0.2, size = 3) +
ggtitle("Principal Component Analysis (PCA)", subtitle = "vst transformation") +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance"))
dev.off()
Volcano plots
We use EnhancedVolcano
to generate volcano plot to visualise the results of differential expression analyses
pCutoff = 0.05
FCcutoff = 1.0
p = EnhancedVolcano(data.frame(res), lab = NA, x = 'log2FoldChange', y = 'padj',
xlab = bquote(~Log[2]~ 'fold change'), ylab = bquote(~-Log[10]~adjusted~italic(P)),
pCutoff = pCutoff, FCcutoff = FCcutoff, pointSize = 1.0, labSize = 2.0,
title = "Volcano plot", subtitle = "SSA/P vs. Normal",
caption = paste0('log2 FC cutoff: ', FCcutoff, '; p-value cutoff: ', pCutoff, '\nTotal = ', nrow(res), ' variables'),
legend=c('NS','Log2 FC','Adjusted p-value', 'Adjusted p-value & Log2 FC'),
legendPosition = 'bottom', legendLabSize = 14, legendIconSize = 5.0)
png("DGE_VolcanoPlots.Salmon.png", width=7, height=7, units = "in", res = 300)
print(p)
dev.off()
Exporting results
We construct a normData
data.frame to store per-group normalised mean and normalised counts of all samples, and a deData
data.frame to store the DESeq results. We merge both data.frame with a annoData
data.frame that contain per-gene annotation derived from GENCODE GTF file before exporting the results in tabular format
annoData = "/home/USER/db/refanno/gencode.v33.annotation_genes.txt"
annoData = data.frame(fread(annoData))
normCounts = as.data.frame(counts(dds, normalized = TRUE))
baseMeans = as.data.frame(sapply( levels(dds$paris_classification),
function(lvl) rowMeans( counts(dds, normalized = TRUE)[, dds$paris_classification == lvl, drop = FALSE] ) ))
normData = merge(annoData, merge(baseMeans, normCounts, by.x = 'row.names', by.y = 'row.names'), by.x = 'GeneID', by.y = 'Row.names')
normData = normData[order(normData$Chromosome, normData$Start, normData$End),]
deData = data.frame(res[,c(1,2,5,6)])
colnames(deData) = c("baseMean","log2fc","pvalue","padj")
deData = merge(annoData, deData, by.x = 'GeneID', by.y = 'row.names')
deData = deData[order(deData$Chromosome, deData$Start, deData$End),]
write.table(normData, file="DGE_DESeq2_Means_and_NormalisedCount.Salmon.txt",
sep = "\t", quote = F, row.names = F, col.names = T)
write.table(deData, file="DGE_DESeq2_DE_results.Salmon.txt",
sep = "\t", quote = F, row.names = F, col.names = T)
> normData[10000:10005,1:12]
GeneID GeneSymbol Chromosome Start End Class Strand Length normal 0-IIa ERR2675454 ERR2675455
20530 ENSG00000206816.1 AP000609.1 chr11 77691650 77691751 misc_RNA + 101 0.0000000 0.00000000 0.000000 0.0000000
42246 ENSG00000254985.1 RSF1-IT2 chr11 77717712 77718411 lncRNA - 699 0.4097166 0.07330207 0.000000 2.9207420
27254 ENSG00000227376.1 FTH1P16 chr11 77734475 77735026 processed_pseudogene - 551 2.3712307 2.06310048 6.536917 0.9725242
42622 ENSG00000255409.1 RSF1-IT1 chr11 77738680 77739568 lncRNA - 888 0.4259810 0.17161175 0.000000 0.9826625
23603 ENSG00000219529.2 AP000580.1 chr11 77813319 77813676 processed_pseudogene - 357 10.6673039 8.50670790 14.406289 6.7761005
1755 ENSG00000087884.14 AAMDC chr11 77821109 77918432 protein_coding + 97323 301.8521056 189.48919021 224.626498 349.5300907
> deData[10000:10005,]
GeneID GeneSymbol Chromosome Start End Class Strand Length baseMean log2fc pvalue padj
20530 ENSG00000206816.1 AP000609.1 chr11 77691650 77691751 misc_RNA + 101 0.0000000 NA NA NA
42246 ENSG00000254985.1 RSF1-IT2 chr11 77717712 77718411 lncRNA - 699 0.2415093 -0.62072985 8.366352e-01 NA
27254 ENSG00000227376.1 FTH1P16 chr11 77734475 77735026 processed_pseudogene - 551 2.2171656 -0.02912317 9.758744e-01 9.860894e-01
42622 ENSG00000255409.1 RSF1-IT1 chr11 77738680 77739568 lncRNA - 888 0.2987963 -0.36554563 9.035198e-01 NA
23603 ENSG00000219529.2 AP000580.1 chr11 77813319 77813676 processed_pseudogene - 357 9.5870059 -0.05559022 9.396577e-01 9.673816e-01
1755 ENSG00000087884.14 AAMDC chr11 77821109 77918432 protein_coding + 97323 245.6706479 -0.68973212 6.942611e-08 9.677700e-07
Session info
> sessionInfo()
R version 3.6.2 (2019-12-12)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 18.04.4 LTS
Matrix products: default
BLAS/LAPACK: /home/USER/miniconda3/lib/libopenblasp-r0.3.9.so
locale:
[1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
[5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
[7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats4 parallel stats graphics grDevices utils datasets
[8] methods base
other attached packages:
[1] EnhancedVolcano_1.4.0 ggrepel_0.8.2
[3] ggplot2_3.3.0 apeglm_1.8.0
[5] DESeq2_1.26.0 SummarizedExperiment_1.16.1
[7] DelayedArray_0.12.3 BiocParallel_1.20.1
[9] matrixStats_0.56.0 tximport_1.14.2
[11] GenomicFeatures_1.38.2 AnnotationDbi_1.48.0
[13] Biobase_2.46.0 GenomicRanges_1.38.0
[15] GenomeInfoDb_1.22.1 IRanges_2.20.2
[17] S4Vectors_0.24.4 BiocGenerics_0.32.0
[19] data.table_1.12.8
loaded via a namespace (and not attached):
[1] bitops_1.0-6 bit64_0.9-7 RColorBrewer_1.1-2
[4] progress_1.2.2 httr_1.4.1 numDeriv_2016.8-1.1
[7] tools_3.6.2 backports_1.1.6 R6_2.4.1
[10] rpart_4.1-15 Hmisc_4.4-0 DBI_1.1.0
[13] colorspace_1.4-1 nnet_7.3-14 withr_2.2.0
[16] tidyselect_1.0.0 gridExtra_2.3 prettyunits_1.1.1
[19] bit_1.1-15.2 curl_4.3 compiler_3.6.2
[22] htmlTable_1.13.3 labeling_0.3 rtracklayer_1.46.0
[25] scales_1.1.0 checkmate_2.0.0 mvtnorm_1.1-0
[28] readr_1.3.1 genefilter_1.68.0 askpass_1.1
[31] rappdirs_0.3.1 stringr_1.4.0 digest_0.6.25
[34] Rsamtools_2.2.3 foreign_0.8-76 XVector_0.26.0
[37] base64enc_0.1-3 jpeg_0.1-8.1 pkgconfig_2.0.3
[40] htmltools_0.4.0 bbmle_1.0.23.1 dbplyr_1.4.3
[43] htmlwidgets_1.5.1 rlang_0.4.6 rstudioapi_0.11
[46] RSQLite_2.2.0 farver_2.0.3 jsonlite_1.6.1
[49] acepack_1.4.1 dplyr_0.8.5 RCurl_1.98-1.2
[52] magrittr_1.5 GenomeInfoDbData_1.2.2 Formula_1.2-3
[55] Matrix_1.2-18 Rcpp_1.0.4.6 munsell_0.5.0
[58] lifecycle_0.2.0 stringi_1.4.6 MASS_7.3-51.6
[61] zlibbioc_1.32.0 plyr_1.8.6 BiocFileCache_1.10.2
[64] grid_3.6.2 blob_1.2.1 bdsmatrix_1.3-4
[67] crayon_1.3.4 lattice_0.20-41 Biostrings_2.54.0
[70] splines_3.6.2 annotate_1.64.0 hms_0.5.3
[73] locfit_1.5-9.4 knitr_1.28 pillar_1.4.4
[76] geneplotter_1.64.0 biomaRt_2.42.1 XML_3.99-0.3
[79] glue_1.4.0 latticeExtra_0.6-29 png_0.1-7
[82] vctrs_0.2.4 gtable_0.3.0 openssl_1.4.1
[85] purrr_0.3.4 assertthat_0.2.1 emdbook_1.3.12
[88] xfun_0.13 xtable_1.8-4 coda_0.19-3
[91] survival_3.1-12 tibble_3.0.1 GenomicAlignments_1.22.1
[94] memoise_1.1.0 cluster_2.1.0 ellipsis_0.3.0
Last updated
Was this helpful?