DTE analysis with STAR + RSEM input
Load required libraries
library(data.table)
library(DESeq2)
library(apeglm)
library(ggplot2)
library(ggrepel)
library(EnhancedVolcano)
Import input data
Load clinical information and define file path
dir = "../../"
sampleData = paste0(dir, "clinical.txt")
sampleData = fread(sampleData)
rownames(sampleData) = sampleData$ENA_RUN
sampleData$individual = as.factor(sampleData$individual)
sampleData$paris_classification = as.factor(sampleData$paris_classification)
# Use relevel() to set adjacent normal samples as reference
sampleData$paris_classification = relevel(sampleData$paris_classification, ref = "normal")
files = list.files(paste0(dir, "rsem_star"), "*soforms.results$", full.names = T)
> files
[1] "../../rsem_star/ERR2675454.isoforms.results"
[2] "../../rsem_star/ERR2675455.isoforms.results"
[3] "../../rsem_star/ERR2675458.isoforms.results"
[4] "../../rsem_star/ERR2675459.isoforms.results"
[5] "../../rsem_star/ERR2675460.isoforms.results"
[6] "../../rsem_star/ERR2675461.isoforms.results"
[7] "../../rsem_star/ERR2675464.isoforms.results"
[8] "../../rsem_star/ERR2675465.isoforms.results"
[9] "../../rsem_star/ERR2675468.isoforms.results"
[10] "../../rsem_star/ERR2675469.isoforms.results"
[11] "../../rsem_star/ERR2675472.isoforms.results"
[12] "../../rsem_star/ERR2675473.isoforms.results"
[13] "../../rsem_star/ERR2675476.isoforms.results"
[14] "../../rsem_star/ERR2675477.isoforms.results"
[15] "../../rsem_star/ERR2675478.isoforms.results"
[16] "../../rsem_star/ERR2675479.isoforms.results"
[17] "../../rsem_star/ERR2675480.isoforms.results"
[18] "../../rsem_star/ERR2675481.isoforms.results"
[19] "../../rsem_star/ERR2675484.isoforms.results"
[20] "../../rsem_star/ERR2675485.isoforms.results"
Import RSEM result file and keep the 5th column containing the expected_count
values. Build a countData
data.frame to store counts
countData = data.frame(fread(files[1]))[c(1,5)]
# Loop and read the 5th column remaining files
for(i in 2:length(files)) {
countData = cbind(countData, data.frame(fread(files[i]))[5])
}
colnames(countData) = c("GeneID", gsub(paste0(dir,"rsem_star/"), "", files))
colnames(countData) = gsub(".isoforms.results", "", colnames(countData))
rownames(countData) = countData$GeneID
countData = countData[,c(2:ncol(countData))]
countData = round(countData) # convert to integer
> countData[1:10,1:5]
ERR2675454 ERR2675455 ERR2675458 ERR2675459 ERR2675460
ENST00000373020.9 874 1921 627 1706 944
ENST00000494424.1 0 0 0 0 0
ENST00000496771.5 3 4 16 19 37
ENST00000612152.4 0 0 0 0 0
ENST00000614008.4 114 103 60 58 60
ENST00000373031.5 29 26 9 36 22
ENST00000485971.1 5 0 0 9 0
ENST00000371582.8 26 54 28 51 54
ENST00000371584.8 18 16 39 35 36
ENST00000371588.9 638 781 796 489 622
Build a DESeqDataSet
from countData
with DESeqDataSetFromMatrix
, providing also the sample information and a design formula
dds = DESeqDataSetFromMatrix(countData = countData,
colData = sampleData, design = ~ individual + paris_classification)
> dds
class: DESeqDataSet
dim: 227912 20
metadata(1): version
assays(1): counts
rownames(227912): ENST00000373020.9 ENST00000494424.1 ... ENST00000673857.1 ENST00000673884.1
rowData names(0):
colnames(20): ERR2675454 ERR2675455 ... ERR2675484 ERR2675485
colData names(11): ENA_RUN individual ... braf_status kras_status
* Import input data (tximport)
Alternatively, tximport
can be used to import expected_count
from RSEM, as follows
library(tximport)
txi = tximport(files, type = "none", txIn = TRUE, txOut = TRUE,
txIdCol = "transcript_id", abundanceCol = "TPM",
countsCol = "expected_count", lengthCol = "effective_length",
importer = function(x) readr::read_tsv(x))
# fix "Error: all(lengths > 0) is not TRUE" error
txi$length[txi$length == 0] = 0.01
dds = DESeqDataSetFromTximport(txi,
colData = sampleData, ~ individual + paris_classification)
> dds
class: DESeqDataSet
dim: 227912 20
metadata(1): version
assays(2): counts avgTxLength
rownames(227912): ENST00000373020.9 ENST00000494424.1 ... ENST00000673857.1 ENST00000673884.1
rowData names(0):
colnames: NULL
colData names(11): ENA_RUN individual ... braf_status kras_status
Differential expression analysis
Run DESeq2 analysis using DESeq
, which performs (1) estimation of size factors, (2) estimation of dispersion, then (3) Negative Binomial GLM fitting and Wald statistics. The results tables (log2 fold changes and p-values) can be generated using the results
function
dds = DESeq(dds)
> cbind(resultsNames(dds))
[,1]
[1,] "Intercept"
[2,] "individual_S11_vs_S1"
[3,] "individual_S12_vs_S1"
[4,] "individual_S15_vs_S1"
[5,] "individual_S17_vs_S1"
[6,] "individual_S3_vs_S1"
[7,] "individual_S5_vs_S1"
[8,] "individual_S6_vs_S1"
[9,] "individual_S7_vs_S1"
[10,] "individual_S9_vs_S1"
[11,] "paris_classification_0.IIa_vs_normal"
We set the adjusted p-value cutoff (FDR) to be 0.05, hence we change the default significance cutoff used for optimizing the independent filtering alpha
from 0.1 to 0.05
res <- results(dds, name = "paris_classification_0.IIa_vs_normal", alpha = 0.05)
At a FDR of 0.05, we have 2945 genes up-regulated and 2871 genes down-regulated in 0-IIa pre-lesion versus Normal
> summary(res)
out of 159634 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up) : 4210, 2.6%
LFC < 0 (down) : 3136, 2%
outliers [1] : 0, 0%
low counts [2] : 87733, 55%
(mean count < 3)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
The results res
object contains the follow columns
> mcols(res)$description
[1] "mean of normalized counts for all samples"
[2] "log2 fold change (MLE): paris classification 0.IIa vs normal"
[3] "standard error: paris classification 0.IIa vs normal"
[4] "Wald statistic: paris classification 0.IIa vs normal"
[5] "Wald test p-value: paris classification 0.IIa vs normal"
[6] "BH adjusted p-values"
> head(res)
log2 fold change (MLE): paris classification 0.IIa vs normal
Wald test p-value: paris classification 0.IIa vs normal
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
ENST00000373020.9 1187.48517486989 -0.63220378627308 0.194294561388125 -3.25384190764961 0.00113855622353168 0.0158582088422363
ENST00000494424.1 0 NA NA NA NA NA
ENST00000496771.5 10.8614532030735 0.332252420760318 0.385893227572389 0.860995728923465 0.389240395417396 0.748731313595519
ENST00000612152.4 0.77778022898225 1.06375257452071 3.00091767488387 0.354475760339502 0.722982366693673 NA
ENST00000614008.4 83.2226874117913 -0.176481709375863 0.356493402266472 -0.495049019852396 0.620565518581719 0.892120965062597
ENST00000373031.5 25.5200220663835 0.0278313866560059 0.389927929551535 0.0713757198362666 0.943098733077851 0.990814121769475
Read about authors' note on p-values and adjusted p-values set to NA here
Exploring results
MA-plot
We use MA-plot to show the log2 fold changes attributable to a given variable (i.e. pre-lesion) over the mean of normalized counts for all the samples. Points will be colored red if the adjusted p value is less than 0.05. Points which fall out of the window are plotted as open triangles pointing either up or down
resLFC = lfcShrink(dds, coef = "paris_classification_0.IIa_vs_normal",
type="apeglm")
png("DTE_MA-plot.RSEM.png", width=7, height=5, units = "in", res = 300)
plotMA(resLFC, alpha = 0.05, ylim=c(-6,6),
main = "MA-plot for the shrunken log2 fold changes")
dev.off()
Principal component plot of the samples
First, we perform count data transformation with regularized logarithm rlog
or variance stabilizing transformations vst
. You can read here about which transformation to choose
rld = rlog(dds)
vsd = vst(dds)
Perform sample PCA for transformed data using plotPCA
, then plot with ggplot
# rlog
pcaData = plotPCA(rld, intgroup=c("individual","paris_classification"),
returnData=TRUE)
percentVar = round(100 * attr(pcaData, "percentVar"))
png("DTE_PCA-rlog.RSEM.png", width=7, height=7, units = "in", res = 300)
ggplot(pcaData, aes(PC1, PC2, colour = paris_classification)) +
geom_point(size = 2) + theme_bw() +
scale_color_manual(values = c("blue", "red")) +
geom_text_repel(aes(label = individual), nudge_x = -1, nudge_y = 0.2, size = 3) +
ggtitle("Principal Component Analysis (PCA)", subtitle = "rlog transformation") +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance"))
dev.off()
# vst
pcaData = plotPCA(vsd, intgroup=c("individual","paris_classification"),
returnData=TRUE)
percentVar = round(100 * attr(pcaData, "percentVar"))
png("DTE_PCA-vst.RSEM.png", width=7, height=7, units = "in", res = 300)
ggplot(pcaData, aes(PC1, PC2, colour = paris_classification)) +
geom_point(size = 2) + theme_bw() +
scale_color_manual(values = c("blue", "red")) +
geom_text_repel(aes(label = individual), nudge_x = -1, nudge_y = 0.2, size = 3) +
ggtitle("Principal Component Analysis (PCA)", subtitle = "vst transformation") +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance"))
dev.off()
Volcano plots
We use EnhancedVolcano
to generate volcano plot to visualise the results of differential expression analyses
pCutoff = 0.05
FCcutoff = 1.0
p = EnhancedVolcano(data.frame(res), lab = NA, x = 'log2FoldChange', y = 'padj',
xlab = bquote(~Log[2]~ 'fold change'), ylab = bquote(~-Log[10]~adjusted~italic(P)),
pCutoff = pCutoff, FCcutoff = FCcutoff, pointSize = 1.0, labSize = 2.0,
title = "Volcano plot", subtitle = "SSA/P vs. Normal",
caption = paste0('log2 FC cutoff: ', FCcutoff, '; p-value cutoff: ', pCutoff, '\nTotal = ', nrow(res), ' variables'),
legend=c('NS','Log2 FC','Adjusted p-value', 'Adjusted p-value & Log2 FC'),
legendPosition = 'bottom', legendLabSize = 14, legendIconSize = 5.0)
png("DTE_VolcanoPlots.RSEM.png", width=7, height=7, units = "in", res = 300)
print(p)
dev.off()
Exporting results
We construct a normData
data.frame to store per-group normalised mean and normalised counts of all samples, and a deData
data.frame to store the DESeq results. We merge both data.frame with a annoData
data.frame that contain per-transcript annotation derived from GENCODE GTF file before exporting the results in tabular format
annoData = "/home/USER/db/refanno/gencode.v33.annotation_transcripts.txt"
annoData = data.frame(fread(annoData))
normCounts = as.data.frame(counts(dds, normalized = TRUE))
baseMeans = as.data.frame(sapply( levels(dds$paris_classification),
function(lvl) rowMeans( counts(dds, normalized = TRUE)[, dds$paris_classification == lvl, drop = FALSE] ) ))
normData = merge(annoData, merge(baseMeans, normCounts, by.x = 'row.names', by.y = 'row.names'), by.x = 'TranscriptID', by.y = 'Row.names')
normData = normData[order(normData$Chromosome, normData$Start, normData$End),]
deData = data.frame(res[,c(1,2,5,6)])
colnames(deData) = c("baseMean","log2fc","pvalue","padj")
deData = merge(annoData, deData, by.x = 'TranscriptID', by.y = 'row.names')
deData = deData[order(deData$Chromosome, deData$Start, deData$End),]
write.table(normData, file="DTE_DESeq2_Means_and_NormalisedCount.RSEM.txt",
sep = "\t", quote = F, row.names = F, col.names = T)
write.table(deData, file="DTE_DESeq2_DE_results.RSEM.txt", sep = "\t",
quote = F, row.names = F, col.names = T)
> normData[normData$GeneSymbol == "ZIC2", 1:14]
TranscriptID GeneID GeneSymbol Chromosome Start End TranscriptName Class Strand Length normal 0-IIa ERR2675454 ERR2675455
25492 ENST00000376335.8 ENSG00000043355.12 ZIC2 chr13 99981784 99986765 ZIC2-201 protein_coding + 4981 1.658049 168.7328173 260.0938 0
93460 ENST00000490085.5 ENSG00000043355.12 ZIC2 chr13 99984289 99985492 ZIC2-205 processed_transcript + 1203 0.000000 0.0000000 0.0000 0
75945 ENST00000468291.1 ENSG00000043355.12 ZIC2 chr13 99984689 99985492 ZIC2-202 processed_transcript + 803 0.000000 0.7729482 0.0000 0
83115 ENST00000477213.1 ENSG00000043355.12 ZIC2 chr13 99984789 99985474 ZIC2-203 processed_transcript + 685 0.000000 0.8654115 0.0000 0
86616 ENST00000481565.1 ENSG00000043355.12 ZIC2 chr13 99985494 99985904 ZIC2-204 processed_transcript + 410 0.000000 0.0000000 0.0000 0
> deData[deData$GeneSymbol == "ZIC2",]
TranscriptID GeneID GeneSymbol Chromosome Start End TranscriptName Class Strand Length baseMean log2fc pvalue padj
25492 ENST00000376335.8 ENSG00000043355.12 ZIC2 chr13 99981784 99986765 ZIC2-201 protein_coding + 4981 85.1954333 6.7850301 2.059453e-42 5.288456e-39
93460 ENST00000490085.5 ENSG00000043355.12 ZIC2 chr13 99984289 99985492 ZIC2-205 processed_transcript + 1203 0.0000000 NA NA NA
75945 ENST00000468291.1 ENSG00000043355.12 ZIC2 chr13 99984689 99985492 ZIC2-202 processed_transcript + 803 0.3864741 0.6503018 8.289198e-01 NA
83115 ENST00000477213.1 ENSG00000043355.12 ZIC2 chr13 99984789 99985474 ZIC2-203 processed_transcript + 685 0.4327057 1.2410927 6.797049e-01 NA
86616 ENST00000481565.1 ENSG00000043355.12 ZIC2 chr13 99985494 99985904 ZIC2-204 processed_transcript + 410 0.0000000 NA NA NA
Session info
> sessionInfo()
R version 3.6.2 (2019-12-12)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 18.04.4 LTS
Matrix products: default
BLAS/LAPACK: /home/USER/miniconda3/lib/libopenblasp-r0.3.9.so
locale:
[1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
[5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
[7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel stats4 stats graphics grDevices utils datasets
[8] methods base
other attached packages:
[1] EnhancedVolcano_1.4.0 ggrepel_0.8.2
[3] ggplot2_3.3.0 apeglm_1.8.0
[5] DESeq2_1.26.0 SummarizedExperiment_1.16.1
[7] DelayedArray_0.12.3 BiocParallel_1.20.1
[9] matrixStats_0.56.0 Biobase_2.46.0
[11] GenomicRanges_1.38.0 GenomeInfoDb_1.22.1
[13] IRanges_2.20.2 S4Vectors_0.24.4
[15] BiocGenerics_0.32.0 data.table_1.12.8
loaded via a namespace (and not attached):
[1] bit64_0.9-7 splines_3.6.2 Formula_1.2-3
[4] assertthat_0.2.1 latticeExtra_0.6-29 blob_1.2.1
[7] GenomeInfoDbData_1.2.2 numDeriv_2016.8-1.1 pillar_1.4.4
[10] RSQLite_2.2.0 backports_1.1.6 lattice_0.20-41
[13] glue_1.4.0 bbmle_1.0.23.1 digest_0.6.25
[16] RColorBrewer_1.1-2 XVector_0.26.0 checkmate_2.0.0
[19] colorspace_1.4-1 plyr_1.8.6 htmltools_0.4.0
[22] Matrix_1.2-18 XML_3.99-0.3 pkgconfig_2.0.3
[25] emdbook_1.3.12 genefilter_1.68.0 zlibbioc_1.32.0
[28] mvtnorm_1.1-0 purrr_0.3.4 xtable_1.8-4
[31] scales_1.1.0 jpeg_0.1-8.1 htmlTable_1.13.3
[34] tibble_3.0.1 annotate_1.64.0 farver_2.0.3
[37] ellipsis_0.3.0 withr_2.2.0 nnet_7.3-14
[40] survival_3.1-12 magrittr_1.5 crayon_1.3.4
[43] memoise_1.1.0 MASS_7.3-51.6 foreign_0.8-76
[46] tools_3.6.2 lifecycle_0.2.0 stringr_1.4.0
[49] locfit_1.5-9.4 munsell_0.5.0 cluster_2.1.0
[52] AnnotationDbi_1.48.0 compiler_3.6.2 rlang_0.4.6
[55] grid_3.6.2 RCurl_1.98-1.2 rstudioapi_0.11
[58] htmlwidgets_1.5.1 labeling_0.3 bitops_1.0-6
[61] base64enc_0.1-3 gtable_0.3.0 DBI_1.1.0
[64] R6_2.4.1 gridExtra_2.3 bdsmatrix_1.3-4
[67] knitr_1.28 dplyr_0.8.5 bit_1.1-15.2
[70] Hmisc_4.4-0 stringi_1.4.6 Rcpp_1.0.4.6
[73] geneplotter_1.64.0 vctrs_0.2.4 rpart_4.1-15
[76] acepack_1.4.1 png_0.1-7 coda_0.19-3
[79] tidyselect_1.0.0 xfun_0.13
Last updated