DGE analysis with STAR + RSEM input
Load required libraries
library(data.table)
library(DESeq2)
library(apeglm)
library(ggplot2)
library(ggrepel)
library(EnhancedVolcano)
Import input data
Load clinical information and define file path
dir = "../../"
sampleData = paste0(dir, "clinical.txt")
sampleData = fread(sampleData)
rownames(sampleData) = sampleData$ENA_RUN
sampleData$individual = as.factor(sampleData$individual)
sampleData$paris_classification = as.factor(sampleData$paris_classification)
# Use relevel() to set adjacent normal samples as reference
sampleData$paris_classification = relevel(sampleData$paris_classification, ref = "normal")
files = list.files(paste0(dir, "rsem_star"), "*genes.results$", full.names = T)
> files
[1] "../../rsem_star/ERR2675454.genes.results"
[2] "../../rsem_star/ERR2675455.genes.results"
[3] "../../rsem_star/ERR2675458.genes.results"
[4] "../../rsem_star/ERR2675459.genes.results"
[5] "../../rsem_star/ERR2675460.genes.results"
[6] "../../rsem_star/ERR2675461.genes.results"
[7] "../../rsem_star/ERR2675464.genes.results"
[8] "../../rsem_star/ERR2675465.genes.results"
[9] "../../rsem_star/ERR2675468.genes.results"
[10] "../../rsem_star/ERR2675469.genes.results"
[11] "../../rsem_star/ERR2675472.genes.results"
[12] "../../rsem_star/ERR2675473.genes.results"
[13] "../../rsem_star/ERR2675476.genes.results"
[14] "../../rsem_star/ERR2675477.genes.results"
[15] "../../rsem_star/ERR2675478.genes.results"
[16] "../../rsem_star/ERR2675479.genes.results"
[17] "../../rsem_star/ERR2675480.genes.results"
[18] "../../rsem_star/ERR2675481.genes.results"
[19] "../../rsem_star/ERR2675484.genes.results"
[20] "../../rsem_star/ERR2675485.genes.results"
Import RSEM result file and keep the 5th column containing the expected_count
values. Build a countData
data.frame to store counts
countData = data.frame(fread(files[1]))[c(1,5)]
# Loop and read the 5th column remaining files
for(i in 2:length(files)) {
countData = cbind(countData, data.frame(fread(files[i]))[5])
}
colnames(countData) = c("GeneID", gsub(paste0(dir,"rsem_star/"), "", files))
colnames(countData) = gsub(".genes.results", "", colnames(countData))
rownames(countData) = countData$GeneID
countData = countData[,c(2:ncol(countData))]
countData = round(countData) # round to integer
> countData[1:10,1:5]
ERR2675454 ERR2675455 ERR2675458 ERR2675459 ERR2675460
ENSG00000000003.15 991 2027 703 1783 1041
ENSG00000000005.6 34 26 9 45 22
ENSG00000000419.12 687 855 871 581 721
ENSG00000000457.14 334 451 290 448 411
ENSG00000000460.17 165 145 136 75 88
ENSG00000000938.13 36 32 48 35 50
ENSG00000000971.16 622 330 505 194 306
ENSG00000001036.14 1508 1668 1553 1315 1366
ENSG00000001084.13 887 994 876 726 977
ENSG00000001167.14 600 581 462 426 424
Build a DESeqDataSet
from countData
with DESeqDataSetFromMatrix
, providing also the sample information and a design formula
dds = DESeqDataSetFromMatrix(countData = countData,
colData = sampleData, design = ~ individual + paris_classification)
> dds
class: DESeqDataSet
dim: 60662 20
metadata(1): version
assays(1): counts
rownames(60662): ENSG00000000003.15 ENSG00000000005.6 ... ENSG00000288587.1 ENSG00000288588.1
rowData names(0):
colnames(20): ERR2675454 ERR2675455 ... ERR2675484 ERR2675485
colData names(11): ENA_RUN individual ... braf_status kras_status
* Import input data (tximport)
Alternatively, tximport
can be used to import expected_count
from RSEM, as follows
library(tximport)
txi = tximport(files, type = "none", txIn = FALSE, txOut = FALSE,
geneIdCol = "gene_id", abundanceCol = "TPM",
countsCol = "expected_count", lengthCol = "effective_length",
importer = function(x) readr::read_tsv(x))
# fix "Error: all(lengths > 0) is not TRUE" error
txi$length[txi$length == 0] = 0.01
dds = DESeqDataSetFromTximport(txi,
colData = sampleData, ~ individual + paris_classification)
> dds
class: DESeqDataSet
dim: 60662 20
metadata(1): version
assays(2): counts avgTxLength
rownames(60662): ENSG00000000003.15 ENSG00000000005.6 ... ENSG00000288587.1 ENSG00000288588.1
rowData names(0):
colnames: NULL
colData names(11): ENA_RUN individual ... braf_status kras_status
Differential expression analysis
Run DESeq2 analysis using DESeq
, which performs (1) estimation of size factors, (2) estimation of dispersion, then (3) Negative Binomial GLM fitting and Wald statistics. The results tables (log2 fold changes and p-values) can be generated using the results
function
dds = DESeq(dds)
> cbind(resultsNames(dds))
[,1]
[1,] "Intercept"
[2,] "individual_S11_vs_S1"
[3,] "individual_S12_vs_S1"
[4,] "individual_S15_vs_S1"
[5,] "individual_S17_vs_S1"
[6,] "individual_S3_vs_S1"
[7,] "individual_S5_vs_S1"
[8,] "individual_S6_vs_S1"
[9,] "individual_S7_vs_S1"
[10,] "individual_S9_vs_S1"
[11,] "paris_classification_0.IIa_vs_normal"
We set the adjusted p-value cutoff (FDR) to be 0.05, hence we change the default significance cutoff used for optimizing the independent filtering alpha
from 0.1 to 0.05
res <- results(dds, name = "paris_classification_0.IIa_vs_normal", alpha = 0.05)
At a FDR of 0.05, we have 2945 genes up-regulated and 2871 genes down-regulated in 0-IIa pre-lesion versus Normal
> summary(res)
out of 35864 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up) : 2945, 8.2%
LFC < 0 (down) : 2871, 8%
outliers [1] : 0, 0%
low counts [2] : 13400, 37%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
The results res
object contains the follow columns
> mcols(res)$description
[1] "mean of normalized counts for all samples"
[2] "log2 fold change (MLE): paris classification 0.IIa vs normal"
[3] "standard error: paris classification 0.IIa vs normal"
[4] "Wald statistic: paris classification 0.IIa vs normal"
[5] "Wald test p-value: paris classification 0.IIa vs normal"
[6] "BH adjusted p-values"
> head(res)
log2 fold change (MLE): paris classification 0.IIa vs normal
Wald test p-value: paris classification 0.IIa vs normal
DataFrame with 6 rows and 6 columns
baseMean log2FoldChange lfcSE stat pvalue padj
<numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
ENSG00000000003.15 1322.39814015782 -0.6354111149118 0.186441123902248 -3.40810601015766 0.000654154755926379 0.00412321668507241
ENSG00000000005.6 28.3907815383573 0.00849284178712473 0.383172558325235 0.0221645355404498 0.982316707175391 0.991962409559852
ENSG00000000419.12 664.582491554523 0.0422102090503606 0.183208609150749 0.230394244277183 0.817785433270278 0.905444912804091
ENSG00000000457.14 366.617875491445 -0.209062144211674 0.134838618582548 -1.55046192559209 0.121030685231956 0.271340650005056
ENSG00000000460.17 115.687405202335 0.173294639670984 0.244996023627107 0.707336540019706 0.479357363226764 0.673928736360463
ENSG00000000938.13 45.4250053873607 0.22813646639418 0.242679457417302 0.940073250624938 0.34717998860563 0.551598505130269
Read about authors' note on p-values and adjusted p-values set to NA here
Exploring results
MA-plot
We use MA-plot to show the log2 fold changes attributable to a given variable (i.e. pre-lesion) over the mean of normalized counts for all the samples. Points will be colored red if the adjusted p value is less than 0.05. Points which fall out of the window are plotted as open triangles pointing either up or down
resLFC = lfcShrink(dds, coef = "paris_classification_0.IIa_vs_normal",
type="apeglm")
png("DGE_MA-plot.RSEM.png", width=7, height=5, units = "in", res = 300)
plotMA(resLFC, alpha = 0.05, ylim=c(-6,6),
main = "MA-plot for the shrunken log2 fold changes")
dev.off()
Principal component plot of the samples
First, we perform count data transformation with regularized logarithm rlog
or variance stabilizing transformations vst
. You can read here about which transformation to choose
rld = rlog(dds)
vsd = vst(dds)
Perform sample PCA for transformed data using plotPCA
, then plot with ggplot
# rlog
pcaData = plotPCA(rld, intgroup=c("individual","paris_classification"),
returnData=TRUE)
percentVar = round(100 * attr(pcaData, "percentVar"))
png("DGE_PCA-rlog.RSEM.png", width=7, height=7, units = "in", res = 300)
ggplot(pcaData, aes(PC1, PC2, colour = paris_classification)) +
geom_point(size = 2) + theme_bw() +
scale_color_manual(values = c("blue", "red")) +
geom_text_repel(aes(label = individual), nudge_x = -1, nudge_y = 0.2, size = 3) +
ggtitle("Principal Component Analysis (PCA)", subtitle = "rlog transformation") +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance"))
dev.off()
# vst
pcaData = plotPCA(vsd, intgroup=c("individual","paris_classification"),
returnData=TRUE)
percentVar = round(100 * attr(pcaData, "percentVar"))
png("DGE_PCA-vst.RSEM.png", width=7, height=7, units = "in", res = 300)
ggplot(pcaData, aes(PC1, PC2, colour = paris_classification)) +
geom_point(size = 2) + theme_bw() +
scale_color_manual(values = c("blue", "red")) +
geom_text_repel(aes(label = individual), nudge_x = -1, nudge_y = 0.2, size = 3) +
ggtitle("Principal Component Analysis (PCA)", subtitle = "vst transformation") +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance"))
dev.off()
Volcano plots
We use EnhancedVolcano
to generate volcano plot to visualise the results of differential expression analyses
pCutoff = 0.05
FCcutoff = 1.0
p = EnhancedVolcano(data.frame(res), lab = NA, x = 'log2FoldChange', y = 'padj',
xlab = bquote(~Log[2]~ 'fold change'), ylab = bquote(~-Log[10]~adjusted~italic(P)),
pCutoff = pCutoff, FCcutoff = FCcutoff, pointSize = 1.0, labSize = 2.0,
title = "Volcano plot", subtitle = "SSA/P vs. Normal",
caption = paste0('log2 FC cutoff: ', FCcutoff, '; p-value cutoff: ', pCutoff, '\nTotal = ', nrow(res), ' variables'),
legend=c('NS','Log2 FC','Adjusted p-value', 'Adjusted p-value & Log2 FC'),
legendPosition = 'bottom', legendLabSize = 14, legendIconSize = 5.0)
png("DGE_VolcanoPlots.RSEM.png", width=7, height=7, units = "in", res = 300)
print(p)
dev.off()
Exporting results
We construct a normData
data.frame to store per-group normalised mean and normalised counts of all samples, and a deData
data.frame to store the DESeq results. We merge both data.frame with a annoData
data.frame that contain per-gene annotation derived from GENCODE GTF file before exporting the results in tabular format
annoData = "/home/USER/db/refanno/gencode.v33.annotation_genes.txt"
annoData = data.frame(fread(annoData))
normCounts = as.data.frame(counts(dds, normalized = TRUE))
baseMeans = as.data.frame(sapply( levels(dds$paris_classification),
function(lvl) rowMeans( counts(dds, normalized = TRUE)[, dds$paris_classification == lvl, drop = FALSE] ) ))
normData = merge(annoData, merge(baseMeans, normCounts, by.x = 'row.names', by.y = 'row.names'), by.x = 'GeneID', by.y = 'Row.names')
normData = normData[order(normData$Chromosome, normData$Start, normData$End),]
deData = data.frame(res[,c(1,2,5,6)])
colnames(deData) = c("baseMean","log2fc","pvalue","padj")
deData = merge(annoData, deData, by.x = 'GeneID', by.y = 'row.names')
deData = deData[order(deData$Chromosome, deData$Start, deData$End),]
write.table(normData, file="DGE_DESeq2_Means_and_NormalisedCount.RSEM.txt",
sep = "\t", quote = F, row.names = F, col.names = T)
write.table(deData, file="DGE_DESeq2_DE_results.RSEM.txt",
sep = "\t", quote = F, row.names = F, col.names = T)
> normData[10000:10005,1:12]
GeneID GeneSymbol Chromosome Start End Class Strand Length normal 0-IIa ERR2675454 ERR2675455
9184 ENSG00000149257.14 SERPINH1 chr11 75562056 75572783 protein_coding + 10727 779.2201385 820.540019 559.218685 791.96615
42777 ENSG00000255326.1 AP001922.5 chr11 75583196 75594665 lncRNA + 11469 0.3663639 1.243649 0.000000 0.00000
13087 ENSG00000171533.11 MAP6 chr11 75586918 75669120 protein_coding - 82202 11.4438896 10.721829 3.289522 18.77121
42876 ENSG00000255434.1 AP001922.6 chr11 75596144 75597270 lncRNA - 1126 0.0000000 0.000000 0.000000 0.00000
42172 ENSG00000254630.1 AP001922.2 chr11 75635883 75638587 lncRNA - 2704 0.0000000 0.000000 0.000000 0.00000
42735 ENSG00000255280.1 AP001922.4 chr11 75642600 75642889 processed_pseudogene + 289 0.0000000 0.000000 0.000000 0.00000
> deData[10000:10005,]
GeneID GeneSymbol Chromosome Start End Class Strand Length baseMean log2fc pvalue padj
9184 ENSG00000149257.14 SERPINH1 chr11 75562056 75572783 protein_coding + 10727 799.8800786 0.07596462 0.6545334 0.8026771
42777 ENSG00000255326.1 AP001922.5 chr11 75583196 75594665 lncRNA + 11469 0.8050064 1.39940614 0.3344497 NA
13087 ENSG00000171533.11 MAP6 chr11 75586918 75669120 protein_coding - 82202 11.0828594 -0.01614570 0.9673578 0.9862002
42876 ENSG00000255434.1 AP001922.6 chr11 75596144 75597270 lncRNA - 1126 0.0000000 NA NA NA
42172 ENSG00000254630.1 AP001922.2 chr11 75635883 75638587 lncRNA - 2704 0.0000000 NA NA NA
42735 ENSG00000255280.1 AP001922.4 chr11 75642600 75642889 processed_pseudogene + 289 0.0000000 NA NA NA
Session info
> sessionInfo()
R version 3.6.2 (2019-12-12)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 18.04.4 LTS
Matrix products: default
BLAS/LAPACK: /home/USER/miniconda3/lib/libopenblasp-r0.3.9.so
locale:
[1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
[5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
[7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel stats4 stats graphics grDevices utils datasets
[8] methods base
other attached packages:
[1] EnhancedVolcano_1.4.0 ggrepel_0.8.2
[3] ggplot2_3.3.0 apeglm_1.8.0
[5] DESeq2_1.26.0 SummarizedExperiment_1.16.1
[7] DelayedArray_0.12.3 BiocParallel_1.20.1
[9] matrixStats_0.56.0 Biobase_2.46.0
[11] GenomicRanges_1.38.0 GenomeInfoDb_1.22.1
[13] IRanges_2.20.2 S4Vectors_0.24.4
[15] BiocGenerics_0.32.0 data.table_1.12.8
loaded via a namespace (and not attached):
[1] bit64_0.9-7 splines_3.6.2 Formula_1.2-3
[4] assertthat_0.2.1 latticeExtra_0.6-29 blob_1.2.1
[7] GenomeInfoDbData_1.2.2 numDeriv_2016.8-1.1 pillar_1.4.4
[10] RSQLite_2.2.0 backports_1.1.6 lattice_0.20-41
[13] glue_1.4.0 bbmle_1.0.23.1 digest_0.6.25
[16] RColorBrewer_1.1-2 XVector_0.26.0 checkmate_2.0.0
[19] colorspace_1.4-1 plyr_1.8.6 htmltools_0.4.0
[22] Matrix_1.2-18 XML_3.99-0.3 pkgconfig_2.0.3
[25] emdbook_1.3.12 genefilter_1.68.0 zlibbioc_1.32.0
[28] mvtnorm_1.1-0 purrr_0.3.4 xtable_1.8-4
[31] scales_1.1.0 jpeg_0.1-8.1 htmlTable_1.13.3
[34] tibble_3.0.1 annotate_1.64.0 farver_2.0.3
[37] ellipsis_0.3.0 withr_2.2.0 nnet_7.3-14
[40] survival_3.1-12 magrittr_1.5 crayon_1.3.4
[43] memoise_1.1.0 MASS_7.3-51.6 foreign_0.8-76
[46] tools_3.6.2 lifecycle_0.2.0 stringr_1.4.0
[49] locfit_1.5-9.4 munsell_0.5.0 cluster_2.1.0
[52] AnnotationDbi_1.48.0 compiler_3.6.2 rlang_0.4.6
[55] grid_3.6.2 RCurl_1.98-1.2 rstudioapi_0.11
[58] htmlwidgets_1.5.1 labeling_0.3 bitops_1.0-6
[61] base64enc_0.1-3 gtable_0.3.0 DBI_1.1.0
[64] R6_2.4.1 gridExtra_2.3 bdsmatrix_1.3-4
[67] knitr_1.28 dplyr_0.8.5 bit_1.1-15.2
[70] Hmisc_4.4-0 stringi_1.4.6 Rcpp_1.0.4.6
[73] geneplotter_1.64.0 vctrs_0.2.4 rpart_4.1-15
[76] acepack_1.4.1 png_0.1-7 coda_0.19-3
[79] tidyselect_1.0.0 xfun_0.13
Last updated