DGE analysis with STAR input

Load required libraries

library(data.table)
library(DESeq2)
library(apeglm)
library(ggplot2)
library(ggrepel)
library(EnhancedVolcano)

Import input data

Load clinical information and define file path

dir = "../../"
sampleData = paste0(dir, "clinical.txt")
sampleData = fread(sampleData)
rownames(sampleData) = sampleData$ENA_RUN
sampleData$individual = as.factor(sampleData$individual)
sampleData$paris_classification = as.factor(sampleData$paris_classification)

# Use relevel() to set adjacent normal samples as reference
sampleData$paris_classification = relevel(sampleData$paris_classification, ref = "normal")

files = list.files(paste0(dir, "star"), "*ReadsPerGene.out.tab$", full.names = T)
> files
 [1] "../../star/ERR2675454_ReadsPerGene.out.tab"
 [2] "../../star/ERR2675455_ReadsPerGene.out.tab"
 [3] "../../star/ERR2675458_ReadsPerGene.out.tab"
 [4] "../../star/ERR2675459_ReadsPerGene.out.tab"
 [5] "../../star/ERR2675460_ReadsPerGene.out.tab"
 [6] "../../star/ERR2675461_ReadsPerGene.out.tab"
 [7] "../../star/ERR2675464_ReadsPerGene.out.tab"
 [8] "../../star/ERR2675465_ReadsPerGene.out.tab"
 [9] "../../star/ERR2675468_ReadsPerGene.out.tab"
[10] "../../star/ERR2675469_ReadsPerGene.out.tab"
[11] "../../star/ERR2675472_ReadsPerGene.out.tab"
[12] "../../star/ERR2675473_ReadsPerGene.out.tab"
[13] "../../star/ERR2675476_ReadsPerGene.out.tab"
[14] "../../star/ERR2675477_ReadsPerGene.out.tab"
[15] "../../star/ERR2675478_ReadsPerGene.out.tab"
[16] "../../star/ERR2675479_ReadsPerGene.out.tab"
[17] "../../star/ERR2675480_ReadsPerGene.out.tab"
[18] "../../star/ERR2675481_ReadsPerGene.out.tab"
[19] "../../star/ERR2675484_ReadsPerGene.out.tab"
[20] "../../star/ERR2675485_ReadsPerGene.out.tab"

Import STAR count file and keep the 4th column containing the reverse stranded counts. Build a countData data.frame to store counts

countData = data.frame(fread(files[1]))[c(1,4)]

# Loop and read the 4th column remaining files
for(i in 2:length(files)) {
        countData = cbind(countData, data.frame(fread(files[i]))[4])
}

# Skip first 4 lines, count data starts on the 5th line
countData = countData[c(5:nrow(countData)),]
colnames(countData) = c("GeneID", gsub(paste0(dir,"star/"), "", files))
colnames(countData) = gsub("_ReadsPerGene.out.tab", "", colnames(countData))
rownames(countData) = countData$GeneID

countData = countData[,c(2:ncol(countData))]
> countData[1:10,1:5]
                  ERR2675454 ERR2675455 ERR2675458 ERR2675459 ERR2675460
ENSG00000223972.5          0          0          0          0          0
ENSG00000227232.5         32         33         13         70         97
ENSG00000278267.1          0          0          2          3          4
ENSG00000243485.5          0          0          0          0          0
ENSG00000284332.1          0          0          0          0          0
ENSG00000237613.2          0          0          0          0          0
ENSG00000268020.3          0          0          0          0          0
ENSG00000240361.2          0          0          0          0          0
ENSG00000186092.6          0          0          0          0          0
ENSG00000238009.6          0          2          0          0          9

Build a DESeqDataSet from countData with DESeqDataSetFromMatrix, providing also the sample information and a design formula

dds = DESeqDataSetFromMatrix(countData = countData, 
colData = sampleData, design = ~ individual + paris_classification)
> dds
class: DESeqDataSet 
dim: 60662 20 
metadata(1): version
assays(1): counts
rownames(60662): ENSG00000223972.5 ENSG00000227232.5 ... ENSG00000210195.2 ENSG00000210196.2
rowData names(0):
colnames(20): ERR2675454 ERR2675455 ... ERR2675484 ERR2675485
colData names(11): ENA_RUN individual ... braf_status kras_status

Differential expression analysis

Run DESeq2 analysis using DESeq, which performs (1) estimation of size factors, (2) estimation of dispersion, then (3) Negative Binomial GLM fitting and Wald statistics. The results tables (log2 fold changes and p-values) can be generated using the results function

dds = DESeq(dds)
> cbind(resultsNames(dds))
      [,1]                                  
 [1,] "Intercept"                           
 [2,] "individual_S11_vs_S1"                
 [3,] "individual_S12_vs_S1"                
 [4,] "individual_S15_vs_S1"                
 [5,] "individual_S17_vs_S1"                
 [6,] "individual_S3_vs_S1"                 
 [7,] "individual_S5_vs_S1"                 
 [8,] "individual_S6_vs_S1"                 
 [9,] "individual_S7_vs_S1"                 
[10,] "individual_S9_vs_S1"                 
[11,] "paris_classification_0.IIa_vs_normal"

We set the adjusted p-value cutoff (FDR) to be 0.05, hence we change the default significance cutoff used for optimizing the independent filtering alpha from 0.1 to 0.05

res <- results(dds, name = "paris_classification_0.IIa_vs_normal", alpha = 0.05)

At a FDR of 0.05, we have 3129 genes up-regulated and 2916 genes down-regulated in 0-IIa pre-lesion versus Normal

> summary(res)

out of 39903 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 3129, 7.8%
LFC < 0 (down)     : 2916, 7.3%
outliers [1]       : 0, 0%
low counts [2]     : 15801, 40%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

The results res object contains the follow columns

> mcols(res)$description
[1] "mean of normalized counts for all samples"                   
[2] "log2 fold change (MLE): paris classification 0.IIa vs normal"
[3] "standard error: paris classification 0.IIa vs normal"        
[4] "Wald statistic: paris classification 0.IIa vs normal"        
[5] "Wald test p-value: paris classification 0.IIa vs normal"     
[6] "BH adjusted p-values"                                        

> head(res)
log2 fold change (MLE): paris classification 0.IIa vs normal 
Wald test p-value: paris classification 0.IIa vs normal 
DataFrame with 6 rows and 6 columns
                           baseMean      log2FoldChange             lfcSE                stat            pvalue              padj
                          <numeric>           <numeric>         <numeric>           <numeric>         <numeric>         <numeric>
ENSG00000223972.5 0.196142797581091 -0.0510133328786562  3.01727024534903 -0.0169071142889142 0.986510717197277                NA
ENSG00000227232.5  65.2360917381934  -0.148867737983724 0.348724795003796  -0.426891749931643  0.66945817485052 0.816108488708069
ENSG00000278267.1  2.73563541954875   0.668374526475476  0.80813859554197     0.8270543322179 0.408206266889902 0.618298738745137
ENSG00000243485.5                 0                  NA                NA                  NA                NA                NA
ENSG00000284332.1                 0                  NA                NA                  NA                NA                NA
ENSG00000237613.2                 0                  NA                NA                  NA                NA                NA

Read about authors' note on p-values and adjusted p-values set to NA here

Exploring results

MA-plot

We use MA-plot to show the log2 fold changes attributable to a given variable (i.e. pre-lesion) over the mean of normalized counts for all the samples. Points will be colored red if the adjusted p value is less than 0.05. Points which fall out of the window are plotted as open triangles pointing either up or down

resLFC = lfcShrink(dds, coef = "paris_classification_0.IIa_vs_normal", 
type="apeglm")

png("DGE_MA-plot.STAR.png", width=7, height=5, units = "in", res = 300)
plotMA(resLFC, alpha = 0.05, ylim=c(-6,6), 
main = "MA-plot for the shrunken log2 fold changes")
dev.off()

Principal component plot of the samples

First, we perform count data transformation with regularized logarithm rlog or variance stabilizing transformations vst. You can read here about which transformation to choose

rld = rlog(dds)
vsd = vst(dds)

Perform sample PCA for transformed data using plotPCA, then plot with ggplot

# rlog
pcaData = plotPCA(rld, intgroup=c("individual","paris_classification"), 
returnData=TRUE)
percentVar = round(100 * attr(pcaData, "percentVar"))

png("DGE_PCA-rlog.STAR.png", width=7, height=7, units = "in", res = 300)
ggplot(pcaData, aes(PC1, PC2, colour = paris_classification)) + 
geom_point(size = 2) + theme_bw() + 
scale_color_manual(values = c("blue", "red")) +
geom_text_repel(aes(label = individual), nudge_x = -1, nudge_y = 0.2, size = 3) +
ggtitle("Principal Component Analysis (PCA)", subtitle = "rlog transformation") +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance"))
dev.off()
# vst
pcaData = plotPCA(vsd, intgroup=c("individual","paris_classification"), 
returnData=TRUE)
percentVar = round(100 * attr(pcaData, "percentVar"))

png("DGE_PCA-vst.STAR.png", width=7, height=7, units = "in", res = 300)
ggplot(pcaData, aes(PC1, PC2, colour = paris_classification)) + 
geom_point(size = 2) + theme_bw() + 
scale_color_manual(values = c("blue", "red")) +
geom_text_repel(aes(label = individual), nudge_x = -1, nudge_y = 0.2, size = 3) +
ggtitle("Principal Component Analysis (PCA)", subtitle = "vst transformation") +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance"))
dev.off()

Volcano plots

We use EnhancedVolcano to generate volcano plot to visualise the results of differential expression analyses

pCutoff = 0.05
FCcutoff = 1.0

p = EnhancedVolcano(data.frame(res), lab = NA, x = 'log2FoldChange', y = 'padj',
xlab = bquote(~Log[2]~ 'fold change'), ylab = bquote(~-Log[10]~adjusted~italic(P)),
pCutoff = pCutoff, FCcutoff = FCcutoff, pointSize = 1.0, labSize = 2.0,
title = "Volcano plot", subtitle = "SSA/P vs. Normal",
caption = paste0('log2 FC cutoff: ', FCcutoff, '; p-value cutoff: ', pCutoff, '\nTotal = ', nrow(res), ' variables'),
legend=c('NS','Log2 FC','Adjusted p-value', 'Adjusted p-value & Log2 FC'),
legendPosition = 'bottom', legendLabSize = 14, legendIconSize = 5.0)

png("DGE_VolcanoPlots.STAR.png", width=7, height=7, units = "in", res = 300)
print(p)
dev.off()

Exporting results

We construct a normData data.frame to store per-group normalised mean and normalised counts of all samples, and a deData data.frame to store the DESeq results. We merge both data.frame with a annoData data.frame that contain per-gene annotation derived from GENCODE GTF file before exporting the results in tabular format

annoData = "/home/USER/db/refanno/gencode.v33.annotation_genes.txt"
annoData = data.frame(fread(annoData))

normCounts = as.data.frame(counts(dds, normalized = TRUE))
baseMeans = as.data.frame(sapply( levels(dds$paris_classification), 
function(lvl) rowMeans( counts(dds, normalized = TRUE)[, dds$paris_classification == lvl, drop = FALSE] ) ))

normData = merge(annoData, merge(baseMeans, normCounts, by.x = 'row.names', by.y = 'row.names'), by.x = 'GeneID', by.y = 'Row.names')
normData = normData[order(normData$Chromosome, normData$Start, normData$End),]

deData = data.frame(res[,c(1,2,5,6)])
colnames(deData) = c("baseMean","log2fc","pvalue","padj")
deData = merge(annoData, deData, by.x = 'GeneID', by.y = 'row.names')
deData = deData[order(deData$Chromosome, deData$Start, deData$End),]

write.table(normData, file="DGE_DESeq2_Means_and_NormalisedCount.STAR.txt", 
sep = "\t", quote = F, row.names = F, col.names = T)
write.table(deData, file="DGE_DESeq2_DE_results.STAR.txt", 
sep = "\t", quote = F, row.names = F, col.names = T)
> normData[10000:10005,1:12]
                  GeneID GeneSymbol Chromosome    Start      End                Class Strand Length      normal        0-IIa ERR2675454 ERR2675455
9184  ENSG00000149257.14   SERPINH1      chr11 75562056 75572783       protein_coding      +  10727 804.9909532 850.91390781 571.034788  817.82929
42777  ENSG00000255326.1 AP001922.5      chr11 75583196 75594665               lncRNA      +  11469   0.3676455   1.86683323   0.000000    0.00000
13087 ENSG00000171533.11       MAP6      chr11 75586918 75669120       protein_coding      -  82202  11.6539986  11.06950146   3.294431   18.99825
42876  ENSG00000255434.1 AP001922.6      chr11 75596144 75597270               lncRNA      -   1126   0.0000000   0.07527231   0.000000    0.00000
42172  ENSG00000254630.1 AP001922.2      chr11 75635883 75638587               lncRNA      -   2704   0.0000000   0.00000000   0.000000    0.00000
42735  ENSG00000255280.1 AP001922.4      chr11 75642600 75642889 processed_pseudogene      +    289   0.0000000   0.00000000   0.000000    0.00000

> deData[10000:10005,]
                  GeneID GeneSymbol Chromosome    Start      End                Class Strand Length     baseMean       log2fc    pvalue      padj
9184  ENSG00000149257.14   SERPINH1      chr11 75562056 75572783       protein_coding      +  10727 827.95243051  0.081912786 0.6410465 0.7998397
42777  ENSG00000255326.1 AP001922.5      chr11 75583196 75594665               lncRNA      +  11469   1.11723936  1.894908372 0.1279743        NA
13087 ENSG00000171533.11       MAP6      chr11 75586918 75669120       protein_coding      -  82202  11.36175003 -0.004785778 0.9900707 0.9957721
42876  ENSG00000255434.1 AP001922.6      chr11 75596144 75597270               lncRNA      -   1126   0.03763615  0.137222714 0.9638312        NA
42172  ENSG00000254630.1 AP001922.2      chr11 75635883 75638587               lncRNA      -   2704   0.00000000           NA        NA        NA
42735  ENSG00000255280.1 AP001922.4      chr11 75642600 75642889 processed_pseudogene      +    289   0.00000000           NA        NA        NA

Session info

> sessionInfo()
R version 3.6.2 (2019-12-12)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 18.04.4 LTS

Matrix products: default
BLAS/LAPACK: /home/USER/miniconda3/lib/libopenblasp-r0.3.9.so

locale:
 [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
 [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
 [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats4    stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] EnhancedVolcano_1.4.0       ggrepel_0.8.2              
 [3] ggplot2_3.3.0               apeglm_1.8.0               
 [5] DESeq2_1.26.0               SummarizedExperiment_1.16.1
 [7] DelayedArray_0.12.3         BiocParallel_1.20.1        
 [9] matrixStats_0.56.0          Biobase_2.46.0             
[11] GenomicRanges_1.38.0        GenomeInfoDb_1.22.1        
[13] IRanges_2.20.2              S4Vectors_0.24.4           
[15] BiocGenerics_0.32.0         data.table_1.12.8          

loaded via a namespace (and not attached):
 [1] bit64_0.9-7            splines_3.6.2          Formula_1.2-3         
 [4] assertthat_0.2.1       latticeExtra_0.6-29    blob_1.2.1            
 [7] GenomeInfoDbData_1.2.2 numDeriv_2016.8-1.1    pillar_1.4.4          
[10] RSQLite_2.2.0          backports_1.1.6        lattice_0.20-41       
[13] glue_1.4.0             bbmle_1.0.23.1         digest_0.6.25         
[16] RColorBrewer_1.1-2     XVector_0.26.0         checkmate_2.0.0       
[19] colorspace_1.4-1       plyr_1.8.6             htmltools_0.4.0       
[22] Matrix_1.2-18          XML_3.99-0.3           pkgconfig_2.0.3       
[25] emdbook_1.3.12         genefilter_1.68.0      zlibbioc_1.32.0       
[28] mvtnorm_1.1-0          purrr_0.3.4            xtable_1.8-4          
[31] scales_1.1.0           jpeg_0.1-8.1           htmlTable_1.13.3      
[34] tibble_3.0.1           annotate_1.64.0        farver_2.0.3          
[37] ellipsis_0.3.0         withr_2.2.0            nnet_7.3-14           
[40] survival_3.1-12        magrittr_1.5           crayon_1.3.4          
[43] memoise_1.1.0          MASS_7.3-51.6          foreign_0.8-76        
[46] tools_3.6.2            lifecycle_0.2.0        stringr_1.4.0         
[49] locfit_1.5-9.4         munsell_0.5.0          cluster_2.1.0         
[52] AnnotationDbi_1.48.0   compiler_3.6.2         rlang_0.4.6           
[55] grid_3.6.2             RCurl_1.98-1.2         rstudioapi_0.11       
[58] htmlwidgets_1.5.1      labeling_0.3           bitops_1.0-6          
[61] base64enc_0.1-3        gtable_0.3.0           DBI_1.1.0             
[64] R6_2.4.1               gridExtra_2.3          bdsmatrix_1.3-4       
[67] knitr_1.28             dplyr_0.8.5            bit_1.1-15.2          
[70] Hmisc_4.4-0            stringi_1.4.6          Rcpp_1.0.4.6          
[73] geneplotter_1.64.0     vctrs_0.2.4            rpart_4.1-15          
[76] acepack_1.4.1          png_0.1-7              coda_0.19-3           
[79] tidyselect_1.0.0       xfun_0.13             

Last updated