DGE analysis with STAR + RSEM input

Load required libraries

library(data.table)
library(DESeq2)
library(apeglm)
library(ggplot2)
library(ggrepel)
library(EnhancedVolcano)

Import input data

Load clinical information and define file path

dir = "../../"
sampleData = paste0(dir, "clinical.txt")
sampleData = fread(sampleData)
rownames(sampleData) = sampleData$ENA_RUN
sampleData$individual = as.factor(sampleData$individual)
sampleData$paris_classification = as.factor(sampleData$paris_classification)

# Use relevel() to set adjacent normal samples as reference
sampleData$paris_classification = relevel(sampleData$paris_classification, ref = "normal")

files = list.files(paste0(dir, "rsem_star"), "*genes.results$", full.names = T)
> files
 [1] "../../rsem_star/ERR2675454.genes.results"
 [2] "../../rsem_star/ERR2675455.genes.results"
 [3] "../../rsem_star/ERR2675458.genes.results"
 [4] "../../rsem_star/ERR2675459.genes.results"
 [5] "../../rsem_star/ERR2675460.genes.results"
 [6] "../../rsem_star/ERR2675461.genes.results"
 [7] "../../rsem_star/ERR2675464.genes.results"
 [8] "../../rsem_star/ERR2675465.genes.results"
 [9] "../../rsem_star/ERR2675468.genes.results"
[10] "../../rsem_star/ERR2675469.genes.results"
[11] "../../rsem_star/ERR2675472.genes.results"
[12] "../../rsem_star/ERR2675473.genes.results"
[13] "../../rsem_star/ERR2675476.genes.results"
[14] "../../rsem_star/ERR2675477.genes.results"
[15] "../../rsem_star/ERR2675478.genes.results"
[16] "../../rsem_star/ERR2675479.genes.results"
[17] "../../rsem_star/ERR2675480.genes.results"
[18] "../../rsem_star/ERR2675481.genes.results"
[19] "../../rsem_star/ERR2675484.genes.results"
[20] "../../rsem_star/ERR2675485.genes.results"

Import RSEM result file and keep the 5th column containing the expected_count values. Build a countData data.frame to store counts

countData = data.frame(fread(files[1]))[c(1,5)]

# Loop and read the 5th column remaining files
for(i in 2:length(files)) {
        countData = cbind(countData, data.frame(fread(files[i]))[5])
}

colnames(countData) = c("GeneID", gsub(paste0(dir,"rsem_star/"), "", files))
colnames(countData) = gsub(".genes.results", "", colnames(countData))
rownames(countData) = countData$GeneID

countData = countData[,c(2:ncol(countData))]
countData = round(countData)    # round to integer
> countData[1:10,1:5]
                   ERR2675454 ERR2675455 ERR2675458 ERR2675459 ERR2675460
ENSG00000000003.15        991       2027        703       1783       1041
ENSG00000000005.6          34         26          9         45         22
ENSG00000000419.12        687        855        871        581        721
ENSG00000000457.14        334        451        290        448        411
ENSG00000000460.17        165        145        136         75         88
ENSG00000000938.13         36         32         48         35         50
ENSG00000000971.16        622        330        505        194        306
ENSG00000001036.14       1508       1668       1553       1315       1366
ENSG00000001084.13        887        994        876        726        977
ENSG00000001167.14        600        581        462        426        424

Build a DESeqDataSet from countData with DESeqDataSetFromMatrix, providing also the sample information and a design formula

dds = DESeqDataSetFromMatrix(countData = countData, 
colData = sampleData, design = ~ individual + paris_classification)
> dds
class: DESeqDataSet 
dim: 60662 20 
metadata(1): version
assays(1): counts
rownames(60662): ENSG00000000003.15 ENSG00000000005.6 ... ENSG00000288587.1 ENSG00000288588.1
rowData names(0):
colnames(20): ERR2675454 ERR2675455 ... ERR2675484 ERR2675485
colData names(11): ENA_RUN individual ... braf_status kras_status

* Import input data (tximport)

Alternatively, tximport can be used to import expected_count from RSEM, as follows

library(tximport)

txi = tximport(files, type = "none", txIn = FALSE, txOut = FALSE, 
geneIdCol = "gene_id", abundanceCol = "TPM", 
countsCol = "expected_count", lengthCol = "effective_length",
importer = function(x) readr::read_tsv(x))

# fix "Error: all(lengths > 0) is not TRUE" error
txi$length[txi$length == 0] = 0.01

dds = DESeqDataSetFromTximport(txi, 
colData = sampleData, ~ individual + paris_classification)
> dds
class: DESeqDataSet 
dim: 60662 20 
metadata(1): version
assays(2): counts avgTxLength
rownames(60662): ENSG00000000003.15 ENSG00000000005.6 ... ENSG00000288587.1 ENSG00000288588.1
rowData names(0):
colnames: NULL
colData names(11): ENA_RUN individual ... braf_status kras_status

Differential expression analysis

Run DESeq2 analysis using DESeq, which performs (1) estimation of size factors, (2) estimation of dispersion, then (3) Negative Binomial GLM fitting and Wald statistics. The results tables (log2 fold changes and p-values) can be generated using the results function

dds = DESeq(dds)
> cbind(resultsNames(dds))
      [,1]                                  
 [1,] "Intercept"                           
 [2,] "individual_S11_vs_S1"                
 [3,] "individual_S12_vs_S1"                
 [4,] "individual_S15_vs_S1"                
 [5,] "individual_S17_vs_S1"                
 [6,] "individual_S3_vs_S1"                 
 [7,] "individual_S5_vs_S1"                 
 [8,] "individual_S6_vs_S1"                 
 [9,] "individual_S7_vs_S1"                 
[10,] "individual_S9_vs_S1"                 
[11,] "paris_classification_0.IIa_vs_normal"

We set the adjusted p-value cutoff (FDR) to be 0.05, hence we change the default significance cutoff used for optimizing the independent filtering alpha from 0.1 to 0.05

res <- results(dds, name = "paris_classification_0.IIa_vs_normal", alpha = 0.05)

At a FDR of 0.05, we have 2945 genes up-regulated and 2871 genes down-regulated in 0-IIa pre-lesion versus Normal

> summary(res)

out of 35864 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 2945, 8.2%
LFC < 0 (down)     : 2871, 8%
outliers [1]       : 0, 0%
low counts [2]     : 13400, 37%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

The results res object contains the follow columns

> mcols(res)$description
[1] "mean of normalized counts for all samples"                   
[2] "log2 fold change (MLE): paris classification 0.IIa vs normal"
[3] "standard error: paris classification 0.IIa vs normal"        
[4] "Wald statistic: paris classification 0.IIa vs normal"        
[5] "Wald test p-value: paris classification 0.IIa vs normal"     
[6] "BH adjusted p-values"                                        

> head(res)
log2 fold change (MLE): paris classification 0.IIa vs normal 
Wald test p-value: paris classification 0.IIa vs normal 
DataFrame with 6 rows and 6 columns
                           baseMean      log2FoldChange             lfcSE               stat               pvalue                padj
                          <numeric>           <numeric>         <numeric>          <numeric>            <numeric>           <numeric>
ENSG00000000003.15 1322.39814015782    -0.6354111149118 0.186441123902248  -3.40810601015766 0.000654154755926379 0.00412321668507241
ENSG00000000005.6  28.3907815383573 0.00849284178712473 0.383172558325235 0.0221645355404498    0.982316707175391   0.991962409559852
ENSG00000000419.12 664.582491554523  0.0422102090503606 0.183208609150749  0.230394244277183    0.817785433270278   0.905444912804091
ENSG00000000457.14 366.617875491445  -0.209062144211674 0.134838618582548  -1.55046192559209    0.121030685231956   0.271340650005056
ENSG00000000460.17 115.687405202335   0.173294639670984 0.244996023627107  0.707336540019706    0.479357363226764   0.673928736360463
ENSG00000000938.13 45.4250053873607    0.22813646639418 0.242679457417302  0.940073250624938     0.34717998860563   0.551598505130269

Read about authors' note on p-values and adjusted p-values set to NA here

Exploring results

MA-plot

We use MA-plot to show the log2 fold changes attributable to a given variable (i.e. pre-lesion) over the mean of normalized counts for all the samples. Points will be colored red if the adjusted p value is less than 0.05. Points which fall out of the window are plotted as open triangles pointing either up or down

resLFC = lfcShrink(dds, coef = "paris_classification_0.IIa_vs_normal", 
type="apeglm")

png("DGE_MA-plot.RSEM.png", width=7, height=5, units = "in", res = 300)
plotMA(resLFC, alpha = 0.05, ylim=c(-6,6), 
main = "MA-plot for the shrunken log2 fold changes")
dev.off()

Principal component plot of the samples

First, we perform count data transformation with regularized logarithm rlog or variance stabilizing transformations vst. You can read here about which transformation to choose

rld = rlog(dds)
vsd = vst(dds)

Perform sample PCA for transformed data using plotPCA, then plot with ggplot

# rlog
pcaData = plotPCA(rld, intgroup=c("individual","paris_classification"), 
returnData=TRUE)
percentVar = round(100 * attr(pcaData, "percentVar"))

png("DGE_PCA-rlog.RSEM.png", width=7, height=7, units = "in", res = 300)
ggplot(pcaData, aes(PC1, PC2, colour = paris_classification)) + 
geom_point(size = 2) + theme_bw() + 
scale_color_manual(values = c("blue", "red")) +
geom_text_repel(aes(label = individual), nudge_x = -1, nudge_y = 0.2, size = 3) +
ggtitle("Principal Component Analysis (PCA)", subtitle = "rlog transformation") +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance"))
dev.off()
# vst
pcaData = plotPCA(vsd, intgroup=c("individual","paris_classification"), 
returnData=TRUE)
percentVar = round(100 * attr(pcaData, "percentVar"))

png("DGE_PCA-vst.RSEM.png", width=7, height=7, units = "in", res = 300)
ggplot(pcaData, aes(PC1, PC2, colour = paris_classification)) + 
geom_point(size = 2) + theme_bw() + 
scale_color_manual(values = c("blue", "red")) +
geom_text_repel(aes(label = individual), nudge_x = -1, nudge_y = 0.2, size = 3) +
ggtitle("Principal Component Analysis (PCA)", subtitle = "vst transformation") +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance"))
dev.off()

Volcano plots

We use EnhancedVolcano to generate volcano plot to visualise the results of differential expression analyses

pCutoff = 0.05
FCcutoff = 1.0

p = EnhancedVolcano(data.frame(res), lab = NA, x = 'log2FoldChange', y = 'padj',
xlab = bquote(~Log[2]~ 'fold change'), ylab = bquote(~-Log[10]~adjusted~italic(P)),
pCutoff = pCutoff, FCcutoff = FCcutoff, pointSize = 1.0, labSize = 2.0,
title = "Volcano plot", subtitle = "SSA/P vs. Normal",
caption = paste0('log2 FC cutoff: ', FCcutoff, '; p-value cutoff: ', pCutoff, '\nTotal = ', nrow(res), ' variables'),
legend=c('NS','Log2 FC','Adjusted p-value', 'Adjusted p-value & Log2 FC'),
legendPosition = 'bottom', legendLabSize = 14, legendIconSize = 5.0)

png("DGE_VolcanoPlots.RSEM.png", width=7, height=7, units = "in", res = 300)
print(p)
dev.off()

Exporting results

We construct a normData data.frame to store per-group normalised mean and normalised counts of all samples, and a deData data.frame to store the DESeq results. We merge both data.frame with a annoData data.frame that contain per-gene annotation derived from GENCODE GTF file before exporting the results in tabular format

annoData = "/home/USER/db/refanno/gencode.v33.annotation_genes.txt"
annoData = data.frame(fread(annoData))

normCounts = as.data.frame(counts(dds, normalized = TRUE))
baseMeans = as.data.frame(sapply( levels(dds$paris_classification), 
function(lvl) rowMeans( counts(dds, normalized = TRUE)[, dds$paris_classification == lvl, drop = FALSE] ) ))

normData = merge(annoData, merge(baseMeans, normCounts, by.x = 'row.names', by.y = 'row.names'), by.x = 'GeneID', by.y = 'Row.names')
normData = normData[order(normData$Chromosome, normData$Start, normData$End),]

deData = data.frame(res[,c(1,2,5,6)])
colnames(deData) = c("baseMean","log2fc","pvalue","padj")
deData = merge(annoData, deData, by.x = 'GeneID', by.y = 'row.names')
deData = deData[order(deData$Chromosome, deData$Start, deData$End),]

write.table(normData, file="DGE_DESeq2_Means_and_NormalisedCount.RSEM.txt", 
sep = "\t", quote = F, row.names = F, col.names = T)
write.table(deData, file="DGE_DESeq2_DE_results.RSEM.txt", 
sep = "\t", quote = F, row.names = F, col.names = T)
> normData[10000:10005,1:12]
                  GeneID GeneSymbol Chromosome    Start      End                Class Strand Length      normal      0-IIa ERR2675454 ERR2675455
9184  ENSG00000149257.14   SERPINH1      chr11 75562056 75572783       protein_coding      +  10727 779.2201385 820.540019 559.218685  791.96615
42777  ENSG00000255326.1 AP001922.5      chr11 75583196 75594665               lncRNA      +  11469   0.3663639   1.243649   0.000000    0.00000
13087 ENSG00000171533.11       MAP6      chr11 75586918 75669120       protein_coding      -  82202  11.4438896  10.721829   3.289522   18.77121
42876  ENSG00000255434.1 AP001922.6      chr11 75596144 75597270               lncRNA      -   1126   0.0000000   0.000000   0.000000    0.00000
42172  ENSG00000254630.1 AP001922.2      chr11 75635883 75638587               lncRNA      -   2704   0.0000000   0.000000   0.000000    0.00000
42735  ENSG00000255280.1 AP001922.4      chr11 75642600 75642889 processed_pseudogene      +    289   0.0000000   0.000000   0.000000    0.00000

> deData[10000:10005,]
                  GeneID GeneSymbol Chromosome    Start      End                Class Strand Length    baseMean      log2fc    pvalue      padj
9184  ENSG00000149257.14   SERPINH1      chr11 75562056 75572783       protein_coding      +  10727 799.8800786  0.07596462 0.6545334 0.8026771
42777  ENSG00000255326.1 AP001922.5      chr11 75583196 75594665               lncRNA      +  11469   0.8050064  1.39940614 0.3344497        NA
13087 ENSG00000171533.11       MAP6      chr11 75586918 75669120       protein_coding      -  82202  11.0828594 -0.01614570 0.9673578 0.9862002
42876  ENSG00000255434.1 AP001922.6      chr11 75596144 75597270               lncRNA      -   1126   0.0000000          NA        NA        NA
42172  ENSG00000254630.1 AP001922.2      chr11 75635883 75638587               lncRNA      -   2704   0.0000000          NA        NA        NA
42735  ENSG00000255280.1 AP001922.4      chr11 75642600 75642889 processed_pseudogene      +    289   0.0000000          NA        NA        NA

Session info

> sessionInfo()
R version 3.6.2 (2019-12-12)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 18.04.4 LTS

Matrix products: default
BLAS/LAPACK: /home/USER/miniconda3/lib/libopenblasp-r0.3.9.so

locale:
 [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
 [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
 [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] EnhancedVolcano_1.4.0       ggrepel_0.8.2              
 [3] ggplot2_3.3.0               apeglm_1.8.0               
 [5] DESeq2_1.26.0               SummarizedExperiment_1.16.1
 [7] DelayedArray_0.12.3         BiocParallel_1.20.1        
 [9] matrixStats_0.56.0          Biobase_2.46.0             
[11] GenomicRanges_1.38.0        GenomeInfoDb_1.22.1        
[13] IRanges_2.20.2              S4Vectors_0.24.4           
[15] BiocGenerics_0.32.0         data.table_1.12.8          

loaded via a namespace (and not attached):
 [1] bit64_0.9-7            splines_3.6.2          Formula_1.2-3         
 [4] assertthat_0.2.1       latticeExtra_0.6-29    blob_1.2.1            
 [7] GenomeInfoDbData_1.2.2 numDeriv_2016.8-1.1    pillar_1.4.4          
[10] RSQLite_2.2.0          backports_1.1.6        lattice_0.20-41       
[13] glue_1.4.0             bbmle_1.0.23.1         digest_0.6.25         
[16] RColorBrewer_1.1-2     XVector_0.26.0         checkmate_2.0.0       
[19] colorspace_1.4-1       plyr_1.8.6             htmltools_0.4.0       
[22] Matrix_1.2-18          XML_3.99-0.3           pkgconfig_2.0.3       
[25] emdbook_1.3.12         genefilter_1.68.0      zlibbioc_1.32.0       
[28] mvtnorm_1.1-0          purrr_0.3.4            xtable_1.8-4          
[31] scales_1.1.0           jpeg_0.1-8.1           htmlTable_1.13.3      
[34] tibble_3.0.1           annotate_1.64.0        farver_2.0.3          
[37] ellipsis_0.3.0         withr_2.2.0            nnet_7.3-14           
[40] survival_3.1-12        magrittr_1.5           crayon_1.3.4          
[43] memoise_1.1.0          MASS_7.3-51.6          foreign_0.8-76        
[46] tools_3.6.2            lifecycle_0.2.0        stringr_1.4.0         
[49] locfit_1.5-9.4         munsell_0.5.0          cluster_2.1.0         
[52] AnnotationDbi_1.48.0   compiler_3.6.2         rlang_0.4.6           
[55] grid_3.6.2             RCurl_1.98-1.2         rstudioapi_0.11       
[58] htmlwidgets_1.5.1      labeling_0.3           bitops_1.0-6          
[61] base64enc_0.1-3        gtable_0.3.0           DBI_1.1.0             
[64] R6_2.4.1               gridExtra_2.3          bdsmatrix_1.3-4       
[67] knitr_1.28             dplyr_0.8.5            bit_1.1-15.2          
[70] Hmisc_4.4-0            stringi_1.4.6          Rcpp_1.0.4.6          
[73] geneplotter_1.64.0     vctrs_0.2.4            rpart_4.1-15          
[76] acepack_1.4.1          png_0.1-7              coda_0.19-3           
[79] tidyselect_1.0.0       xfun_0.13             

Last updated