DGE and DTE analysis of Salmon/Kallisto inputs using Sleuth
Analysis and result presented was performed with Salmon counts, Code snippet to import Kallisto counts is also provided
DGE analysis
cd /home/USER/SSAPs/analysis/dge
R
Load required libraries
library(data.table)
library(sleuth)
Import clinical data
Load clinical information and define file path
dir = "../../"
sampleData = paste0(dir, "clinical.txt")
sampleData = fread(sampleData)
rownames(sampleData) = sampleData$ENA_RUN
sampleData$individual = as.factor(sampleData$individual)
sampleData$paris_classification = as.factor(sampleData$paris_classification)
# Use relevel() to set adjacent normal samples as reference
sampleData$paris_classification = relevel(sampleData$paris_classification, ref = "normal")
Construct t2g
data.frame
t2g
data.frameLoad the annoData
data.frame that contain per-transcript annotation derived from GENCODE GTF file and constrct a transcript-to-gene data.frame t2g
in the specified format by Sleuth
annoData = "/home/USER/db/refanno/gencode.v33.annotation_transcripts.txt"
annoData = data.frame(fread(annoData))
t2g = annoData[,c("TranscriptID","GeneID","GeneSymbol")]
colnames(t2g) = c("target_id","ens_gene","ext_gene")
> head(t2g)
target_id ens_gene ext_gene
1 ENST00000456328.2 ENSG00000223972.5 DDX11L1
2 ENST00000450305.2 ENSG00000223972.5 DDX11L1
3 ENST00000488147.1 ENSG00000227232.5 WASH7P
4 ENST00000619216.1 ENSG00000278267.1 MIR6859-1
5 ENST00000473358.1 ENSG00000243485.5 MIR1302-2HG
6 ENST00000469289.1 ENSG00000243485.5 MIR1302-2HG
Construct s2c
data.frame
s2c
data.frame Salmon
files = file.path(paste0(dir, "salmon-bs"), sampleData$ENA_RUN)
names(files) = sampleData$ENA_RUN
> files
ERR2675454 ERR2675455
"../../salmon-bs/ERR2675454" "../../salmon-bs/ERR2675455"
ERR2675458 ERR2675459
"../../salmon-bs/ERR2675458" "../../salmon-bs/ERR2675459"
ERR2675460 ERR2675461
"../../salmon-bs/ERR2675460" "../../salmon-bs/ERR2675461"
ERR2675464 ERR2675465
"../../salmon-bs/ERR2675464" "../../salmon-bs/ERR2675465"
ERR2675468 ERR2675469
"../../salmon-bs/ERR2675468" "../../salmon-bs/ERR2675469"
ERR2675472 ERR2675473
"../../salmon-bs/ERR2675472" "../../salmon-bs/ERR2675473"
ERR2675476 ERR2675477
"../../salmon-bs/ERR2675476" "../../salmon-bs/ERR2675477"
ERR2675478 ERR2675479
"../../salmon-bs/ERR2675478" "../../salmon-bs/ERR2675479"
ERR2675480 ERR2675481
"../../salmon-bs/ERR2675480" "../../salmon-bs/ERR2675481"
ERR2675484 ERR2675485
"../../salmon-bs/ERR2675484" "../../salmon-bs/ERR2675485"
Create a sample_to_covariates data.frame s2c
that contains the required information in the specified format by Sleuth
s2c = data.frame(sample = sampleData$ENA_RUN,
condition = sampleData$paris_classification,
path = files, stringsAsFactors = FALSE)
> s2c
sample condition path
ERR2675454 ERR2675454 0-IIa ../../salmon-bs/ERR2675454
ERR2675455 ERR2675455 normal ../../salmon-bs/ERR2675455
ERR2675458 ERR2675458 0-IIa ../../salmon-bs/ERR2675458
ERR2675459 ERR2675459 normal ../../salmon-bs/ERR2675459
ERR2675460 ERR2675460 0-IIa ../../salmon-bs/ERR2675460
ERR2675461 ERR2675461 normal ../../salmon-bs/ERR2675461
ERR2675464 ERR2675464 0-IIa ../../salmon-bs/ERR2675464
ERR2675465 ERR2675465 normal ../../salmon-bs/ERR2675465
ERR2675468 ERR2675468 0-IIa ../../salmon-bs/ERR2675468
ERR2675469 ERR2675469 normal ../../salmon-bs/ERR2675469
ERR2675472 ERR2675472 0-IIa ../../salmon-bs/ERR2675472
ERR2675473 ERR2675473 normal ../../salmon-bs/ERR2675473
ERR2675476 ERR2675476 0-IIa ../../salmon-bs/ERR2675476
ERR2675477 ERR2675477 normal ../../salmon-bs/ERR2675477
ERR2675478 ERR2675478 0-IIa ../../salmon-bs/ERR2675478
ERR2675479 ERR2675479 normal ../../salmon-bs/ERR2675479
ERR2675480 ERR2675480 0-IIa ../../salmon-bs/ERR2675480
ERR2675481 ERR2675481 normal ../../salmon-bs/ERR2675481
ERR2675484 ERR2675484 0-IIa ../../salmon-bs/ERR2675484
ERR2675485 ERR2675485 normal ../../salmon-bs/ERR2675485
> levels(s2c$condition)
[1] "normal" "0-IIa"
Kallisto
files = file.path(paste0(dir, "kallisto"), sampleData$ENA_RUN)
names(files) = sampleData$ENA_RUN
> files
ERR2675454 ERR2675455
"../../kallisto/ERR2675454" "../../kallisto/ERR2675455"
ERR2675458 ERR2675459
"../../kallisto/ERR2675458" "../../kallisto/ERR2675459"
ERR2675460 ERR2675461
"../../kallisto/ERR2675460" "../../kallisto/ERR2675461"
ERR2675464 ERR2675465
"../../kallisto/ERR2675464" "../../kallisto/ERR2675465"
ERR2675468 ERR2675469
"../../kallisto/ERR2675468" "../../kallisto/ERR2675469"
ERR2675472 ERR2675473
"../../kallisto/ERR2675472" "../../kallisto/ERR2675473"
ERR2675476 ERR2675477
"../../kallisto/ERR2675476" "../../kallisto/ERR2675477"
ERR2675478 ERR2675479
"../../kallisto/ERR2675478" "../../kallisto/ERR2675479"
ERR2675480 ERR2675481
"../../kallisto/ERR2675480" "../../kallisto/ERR2675481"
ERR2675484 ERR2675485
"../../kallisto/ERR2675484" "../../kallisto/ERR2675485"
Create a sample_to_covariates data.frame s2c
that contains the required information in the specified format by Sleuth
s2c = data.frame(sample = sampleData$ENA_RUN,
condition = sampleData$paris_classification,
path = files, stringsAsFactors = FALSE)
> s2c
sample condition path
ERR2675454 ERR2675454 0-IIa ../../kallisto/ERR2675454
ERR2675455 ERR2675455 normal ../../kallisto/ERR2675455
ERR2675458 ERR2675458 0-IIa ../../kallisto/ERR2675458
ERR2675459 ERR2675459 normal ../../kallisto/ERR2675459
ERR2675460 ERR2675460 0-IIa ../../kallisto/ERR2675460
ERR2675461 ERR2675461 normal ../../kallisto/ERR2675461
ERR2675464 ERR2675464 0-IIa ../../kallisto/ERR2675464
ERR2675465 ERR2675465 normal ../../kallisto/ERR2675465
ERR2675468 ERR2675468 0-IIa ../../kallisto/ERR2675468
ERR2675469 ERR2675469 normal ../../kallisto/ERR2675469
ERR2675472 ERR2675472 0-IIa ../../kallisto/ERR2675472
ERR2675473 ERR2675473 normal ../../kallisto/ERR2675473
ERR2675476 ERR2675476 0-IIa ../../kallisto/ERR2675476
ERR2675477 ERR2675477 normal ../../kallisto/ERR2675477
ERR2675478 ERR2675478 0-IIa ../../kallisto/ERR2675478
ERR2675479 ERR2675479 normal ../../kallisto/ERR2675479
ERR2675480 ERR2675480 0-IIa ../../kallisto/ERR2675480
ERR2675481 ERR2675481 normal ../../kallisto/ERR2675481
ERR2675484 ERR2675484 0-IIa ../../kallisto/ERR2675484
ERR2675485 ERR2675485 normal ../../kallisto/ERR2675485
Construct Sleuth object
Construct the Sleuth object so
with sleuth_prep
. This object will store the metadata, model to be used for differential testing, transcript-to-gene and bootstrap information
Use aggregation_column = "ens_gene"
to allow summarize the data on the gene level and gene_mode = TRUE
to do counts aggregation at the gene level for normalization, transformation, and downstream steps.
so = sleuth_prep(s2c, ~ condition, target_mapping = t2g,
aggregation_column = "ens_gene", gene_mode = TRUE,
extra_bootstrap_summary = TRUE, read_bootstrap_tpm = TRUE,
transformation_function = function(x) log2(x + 0.5))
reading in kallisto results
dropping unused factor levels
....................
normalizing est_counts
51524 targets passed the filter
normalizing tpm
merging in metadata
aggregating by column: ens_gene
17692 genes passed the filter
summarizing bootstraps
....................
Differential expression analysis
The likelihood ratio test is performed with a 3-step procedure. First, fit the full model by fit_name = "full"
, then performs a second fit to a “reduced” model that presumes abundances are equal in the two conditions. Finally, Sleuth will perform LRT to identify transcripts with a significantly better fit with the “full” model
so = sleuth_fit(so, ~condition, fit_name = "full")
so = sleuth_fit(so, ~1, fit_name = "reduced")
so = sleuth_lrt(so, null_model = "reduced", alt_model = "full")
The models that have been fit can be examined with the models()
function.
> models(so)
[ full ]
formula: ~condition
data modeled: obs_counts
transform sync'ed: TRUE
coefficients:
(Intercept)
condition0-IIa
[ reduced ]
formula: ~1
data modeled: obs_counts
transform sync'ed: TRUE
coefficients:
(Intercept)
Retrieve results of the test
res = sleuth_results(so, test = "reduced:full", test_type = "lrt", show_all = TRUE)
> head(res)
target_id ext_gene pval qval test_stat rss degrees_free mean_obs var_obs tech_var sigma_sq smooth_sigma_sq final_sigma_sq
1 ENSG00000134193.15 REG4 7.752172e-15 4.571714e-11 60.39724 119.68635 1 9.429405 6.299281 0.0005691108 6.298712 0.11357166 6.298712
2 ENSG00000062038.14 CDH3 7.599095e-15 4.571714e-11 60.43649 116.97036 1 2.960657 6.156335 0.1020903488 6.054245 0.28391752 6.054245
3 ENSG00000229637.4 PRAC2 4.337807e-15 4.571714e-11 61.54027 83.68869 1 1.457273 4.404668 0.0477239422 4.356944 0.45623196 4.356944
4 ENSG00000120875.9 DUSP4 2.600851e-14 1.150356e-10 58.01531 68.69969 1 5.183719 3.615773 0.0175501251 3.598223 0.11098455 3.598223
5 ENSG00000181577.16 C6orf223 4.235939e-14 1.498845e-10 57.05592 118.50929 1 3.753971 6.237331 0.1957375093 6.041594 0.19989544 6.041594
6 ENSG00000196188.11 CTSE 1.287869e-13 2.488331e-10 54.86977 81.91557 1 7.031097 4.311346 0.0025249259 4.308821 0.09906819 4.308821
Exporting results
We construct a normData
data.frame to store per-group normalised mean transcripts per million (TPM) and normalised TPM of all samples, and a deData
data.frame to store the Sleuth LRT results. We merge both data.frame with a annoData
data.frame that contain per-transcript annotation derived from GENCODE GTF file before exporting the results in tabular format
tpm_norm = as.data.frame(sleuth_to_matrix(so, "obs_norm", "tpm"))
tpm_mean = as.data.frame(sapply( levels(s2c$condition),
function(lvl) rowMeans( tpm_norm[, s2c$condition == lvl, drop = FALSE] ) ))
# log2 fold change in expression
tpm.log2fc = log2(tpm_mean[2]/tpm_mean[1])
colnames(tpm.log2fc) = "log2fc"
# Load gene-level annotation
annoData = "/home/USER/db/refanno/gencode.v33.annotation_genes.txt"
annoData = data.frame(fread(annoData))
normData = merge(annoData, merge(tpm_mean, tpm_norm, by = "row.names"), by.x = "GeneID", by.y = "Row.names")
normData = normData[order(normData$Chromosome, normData$Start, normData$End),]
deData = merge(res[,c("target_id","mean_obs","pval","qval")], tpm.log2fc, by.x = "target_id", by.y = "row.names")
colnames(deData) = c("target_id","baseMean","pvalue","padj","log2fc")
deData = deData[,c("target_id","baseMean","log2fc","pvalue","padj")]
deData[is.nan(deData$log2fc),]$log2fc = NA
deData[is.infinite(deData$log2fc),]$log2fc = NA
deData = merge(annoData, deData, by.x = 'GeneID', by.y = 'target_id')
deData = deData[order(deData$Chromosome, deData$Start, deData$End),]
write.table(normData, file="DGE_sleuth_Means_and_NormalisedTPM.Salmon.txt", sep = "\t", quote = F, row.names = F, col.names = T)
write.table(deData, file="DGE_sleuth_DE_results.Salmon.txt", sep = "\t", quote = F, row.names = F, col.names = T)
> normData[1:10,1:14]
GeneID GeneSymbol Chromosome Start End Class Strand Length normal 0-IIa ERR2675454 ERR2675455 ERR2675458 ERR2675459
24914 ENSG00000223972.5 DDX11L1 chr1 11869 14409 transcribed_unprocessed_pseudogene + 2540 0.02578694 0.002660157 0.0000000 0.0000000 0.000000 0.000000
27153 ENSG00000227232.5 WASH7P chr1 14404 29570 unprocessed_pseudogene - 15166 9.64654711 7.418128917 5.0318657 7.0457119 5.043944 11.669079
54425 ENSG00000278267.1 MIR6859-1 chr1 17369 17436 miRNA - 67 0.00000000 0.000000000 0.0000000 0.0000000 0.000000 0.000000
36601 ENSG00000243485.5 MIR1302-2HG chr1 29554 31109 lncRNA + 1555 0.01174571 0.000000000 0.0000000 0.1174571 0.000000 0.000000
57122 ENSG00000284332.1 MIR1302-2 chr1 30366 30503 miRNA + 137 0.00000000 0.000000000 0.0000000 0.0000000 0.000000 0.000000
34149 ENSG00000237613.2 FAM138A chr1 34554 36081 lncRNA - 1527 0.00000000 0.000000000 0.0000000 0.0000000 0.000000 0.000000
49448 ENSG00000268020.3 OR4G4P chr1 52473 53312 unprocessed_pseudogene + 839 0.00000000 0.000000000 0.0000000 0.0000000 0.000000 0.000000
35334 ENSG00000240361.2 OR4G11P chr1 57598 64116 transcribed_unprocessed_pseudogene + 6518 0.00000000 0.000000000 0.0000000 0.0000000 0.000000 0.000000
16136 ENSG00000186092.6 OR4F5 chr1 65419 71585 protein_coding + 6166 0.00000000 0.000000000 0.0000000 0.0000000 0.000000 0.000000
34420 ENSG00000238009.6 AL627309.1 chr1 89295 133723 lncRNA - 44428 0.37463259 0.356671874 0.1921258 0.5975709 0.389049 0.307692
> deData[1:10,]
GeneID GeneSymbol Chromosome Start End Class Strand Length baseMean log2fc pvalue padj
24914 ENSG00000223972.5 DDX11L1 chr1 11869 14409 transcribed_unprocessed_pseudogene + 2540 NA -3.27705754 NA NA
27153 ENSG00000227232.5 WASH7P chr1 14404 29570 unprocessed_pseudogene - 15166 5.289557 -0.37895730 0.1989157 0.2871868
54425 ENSG00000278267.1 MIR6859-1 chr1 17369 17436 miRNA - 67 NA NA NA NA
36601 ENSG00000243485.5 MIR1302-2HG chr1 29554 31109 lncRNA + 1555 NA NA NA NA
57122 ENSG00000284332.1 MIR1302-2 chr1 30366 30503 miRNA + 137 NA NA NA NA
34149 ENSG00000237613.2 FAM138A chr1 34554 36081 lncRNA - 1527 NA NA NA NA
49448 ENSG00000268020.3 OR4G4P chr1 52473 53312 unprocessed_pseudogene + 839 NA NA NA NA
35334 ENSG00000240361.2 OR4G11P chr1 57598 64116 transcribed_unprocessed_pseudogene + 6518 NA NA NA NA
16136 ENSG00000186092.6 OR4F5 chr1 65419 71585 protein_coding + 6166 NA NA NA NA
34420 ENSG00000238009.6 AL627309.1 chr1 89295 133723 lncRNA - 44428 1.263687 -0.07087897 0.2654303 0.3487556
Exploring results
Principal component plot of the samples
png("DGE_sleuth_PCA.png", width=7, height=7, units = "in", res = 300)
plot_pca(so, color_by = "condition", text_labels = TRUE, units = "tpm")
dev.off()
Plot sample heatmap using the Jensen-Shannon divergence
png("DGE_sleuth_sample_heatmap.png", width=7, height=7, units = "in", res = 300)
plot_sample_heatmap(so, use_filtered = FALSE)
dev.off()
Lower divergence values represent samples that are more similar to each other
Plot clustered transcript heatmap
Create expression heatmap for top 40 genes with lowest q-values
png("DGE_sleuth_transcript_heatmap.png", width=7, height=7, units = "in", res = 300)
plot_transcript_heatmap(so, cluster_transcripts = TRUE,
transcripts = subset(res[order(res$qval),], qval < 0.05)$target_id[1:40]) # top40
dev.off()
Plot the normalized bootstraps across all samples
Plot expression with bootstrap variation for selected gene
png("DGE_sleuth_plot_bootstrap.png", width=7, height=7, units = "in", res = 300)
plot_bootstrap(so, target_id = "ENSG00000062038.14",
units = "tpm", color_by = "condition")
dev.off()
Session info
> sessionInfo()
R version 3.6.3 (2020-02-29)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 18.04.4 LTS
Matrix products: default
BLAS/LAPACK: /home/USER/miniconda3/lib/libopenblasp-r0.3.9.so
locale:
[1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
[5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
[7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods
[7] base
other attached packages:
[1] sleuth_0.30.0 data.table_1.12.8
loaded via a namespace (and not attached):
[1] Rcpp_1.0.4.6 RColorBrewer_1.1-2 pillar_1.4.4
[4] compiler_3.6.3 plyr_1.8.6 tools_3.6.3
[7] digest_0.6.25 lifecycle_0.2.0 tibble_3.0.1
[10] gtable_0.3.0 rhdf5_2.30.1 pkgconfig_2.0.3
[13] rlang_0.4.6 cli_2.0.2 parallel_3.6.3
[16] gridExtra_2.3 dplyr_0.8.5 stringr_1.4.0
[19] vctrs_0.3.0 grid_3.6.3 tidyselect_1.1.0
[22] glue_1.4.1 R6_2.4.1 fansi_0.4.1
[25] pheatmap_1.0.12 ggplot2_3.3.0 Rhdf5lib_1.8.0
[28] purrr_0.3.4 tidyr_1.0.3 reshape2_1.4.4
[31] farver_2.0.3 magrittr_1.5 scales_1.1.1
[34] ellipsis_0.3.0 assertthat_0.2.1 colorspace_1.4-1
[37] labeling_0.3 stringi_1.4.6 lazyeval_0.2.2
[40] munsell_0.5.0 aggregation_1.0.1 crayon_1.3.4
DTE analysis
cd /home/USER/SSAPs/analysis/dte
R
Load required libraries
library(data.table)
library(sleuth)
Import clinical data
Load clinical information and define file path
dir = "../../"
sampleData = paste0(dir, "clinical.txt")
sampleData = fread(sampleData)
rownames(sampleData) = sampleData$ENA_RUN
sampleData$individual = as.factor(sampleData$individual)
sampleData$paris_classification = as.factor(sampleData$paris_classification)
# Use relevel() to set adjacent normal samples as reference
sampleData$paris_classification = relevel(sampleData$paris_classification, ref = "normal")
Construct t2g
data.frame
t2g
data.frameLoad the annoData
data.frame that contain per-transcript annotation derived from GENCODE GTF file and constrct a transcript-to-gene data.frame t2g
in the specified format by Sleuth
annoData = "/home/USER/db/refanno/gencode.v33.annotation_transcripts.txt"
annoData = data.frame(fread(annoData))
t2g = annoData[,c("TranscriptID","GeneID","GeneSymbol")]
colnames(t2g) = c("target_id","ens_gene","ext_gene")
> head(t2g)
target_id ens_gene ext_gene
1 ENST00000456328.2 ENSG00000223972.5 DDX11L1
2 ENST00000450305.2 ENSG00000223972.5 DDX11L1
3 ENST00000488147.1 ENSG00000227232.5 WASH7P
4 ENST00000619216.1 ENSG00000278267.1 MIR6859-1
5 ENST00000473358.1 ENSG00000243485.5 MIR1302-2HG
6 ENST00000469289.1 ENSG00000243485.5 MIR1302-2HG
Construct s2c
data.frame
s2c
data.frame Salmon
files = file.path(paste0(dir, "salmon-bs"), sampleData$ENA_RUN)
names(files) = sampleData$ENA_RUN
> files
ERR2675454 ERR2675455
"../../salmon-bs/ERR2675454" "../../salmon-bs/ERR2675455"
ERR2675458 ERR2675459
"../../salmon-bs/ERR2675458" "../../salmon-bs/ERR2675459"
ERR2675460 ERR2675461
"../../salmon-bs/ERR2675460" "../../salmon-bs/ERR2675461"
ERR2675464 ERR2675465
"../../salmon-bs/ERR2675464" "../../salmon-bs/ERR2675465"
ERR2675468 ERR2675469
"../../salmon-bs/ERR2675468" "../../salmon-bs/ERR2675469"
ERR2675472 ERR2675473
"../../salmon-bs/ERR2675472" "../../salmon-bs/ERR2675473"
ERR2675476 ERR2675477
"../../salmon-bs/ERR2675476" "../../salmon-bs/ERR2675477"
ERR2675478 ERR2675479
"../../salmon-bs/ERR2675478" "../../salmon-bs/ERR2675479"
ERR2675480 ERR2675481
"../../salmon-bs/ERR2675480" "../../salmon-bs/ERR2675481"
ERR2675484 ERR2675485
"../../salmon-bs/ERR2675484" "../../salmon-bs/ERR2675485"
Create a data.frame s2c
that contains the required information in the specified format by Sleuth
s2c = data.frame(sample = sampleData$ENA_RUN,
condition = sampleData$paris_classification,
path = files, stringsAsFactors = FALSE)
> s2c
sample condition path
ERR2675454 ERR2675454 0-IIa ../../salmon-bs/ERR2675454
ERR2675455 ERR2675455 normal ../../salmon-bs/ERR2675455
ERR2675458 ERR2675458 0-IIa ../../salmon-bs/ERR2675458
ERR2675459 ERR2675459 normal ../../salmon-bs/ERR2675459
ERR2675460 ERR2675460 0-IIa ../../salmon-bs/ERR2675460
ERR2675461 ERR2675461 normal ../../salmon-bs/ERR2675461
ERR2675464 ERR2675464 0-IIa ../../salmon-bs/ERR2675464
ERR2675465 ERR2675465 normal ../../salmon-bs/ERR2675465
ERR2675468 ERR2675468 0-IIa ../../salmon-bs/ERR2675468
ERR2675469 ERR2675469 normal ../../salmon-bs/ERR2675469
ERR2675472 ERR2675472 0-IIa ../../salmon-bs/ERR2675472
ERR2675473 ERR2675473 normal ../../salmon-bs/ERR2675473
ERR2675476 ERR2675476 0-IIa ../../salmon-bs/ERR2675476
ERR2675477 ERR2675477 normal ../../salmon-bs/ERR2675477
ERR2675478 ERR2675478 0-IIa ../../salmon-bs/ERR2675478
ERR2675479 ERR2675479 normal ../../salmon-bs/ERR2675479
ERR2675480 ERR2675480 0-IIa ../../salmon-bs/ERR2675480
ERR2675481 ERR2675481 normal ../../salmon-bs/ERR2675481
ERR2675484 ERR2675484 0-IIa ../../salmon-bs/ERR2675484
ERR2675485 ERR2675485 normal ../../salmon-bs/ERR2675485
> levels(s2c$condition)
[1] "normal" "0-IIa"
Kallisto
files = file.path(paste0(dir, "kallisto"), sampleData$ENA_RUN)
names(files) = sampleData$ENA_RUN
> files
ERR2675454 ERR2675455
"../../kallisto/ERR2675454" "../../kallisto/ERR2675455"
ERR2675458 ERR2675459
"../../kallisto/ERR2675458" "../../kallisto/ERR2675459"
ERR2675460 ERR2675461
"../../kallisto/ERR2675460" "../../kallisto/ERR2675461"
ERR2675464 ERR2675465
"../../kallisto/ERR2675464" "../../kallisto/ERR2675465"
ERR2675468 ERR2675469
"../../kallisto/ERR2675468" "../../kallisto/ERR2675469"
ERR2675472 ERR2675473
"../../kallisto/ERR2675472" "../../kallisto/ERR2675473"
ERR2675476 ERR2675477
"../../kallisto/ERR2675476" "../../kallisto/ERR2675477"
ERR2675478 ERR2675479
"../../kallisto/ERR2675478" "../../kallisto/ERR2675479"
ERR2675480 ERR2675481
"../../kallisto/ERR2675480" "../../kallisto/ERR2675481"
ERR2675484 ERR2675485
"../../kallisto/ERR2675484" "../../kallisto/ERR2675485"
Create a sample_to_covariates data.frame s2c
that contains the required information in the specified format by Sleuth
s2c = data.frame(sample = sampleData$ENA_RUN,
condition = sampleData$paris_classification,
path = files, stringsAsFactors = FALSE)
> s2c
sample condition path
ERR2675454 ERR2675454 0-IIa ../../kallisto/ERR2675454
ERR2675455 ERR2675455 normal ../../kallisto/ERR2675455
ERR2675458 ERR2675458 0-IIa ../../kallisto/ERR2675458
ERR2675459 ERR2675459 normal ../../kallisto/ERR2675459
ERR2675460 ERR2675460 0-IIa ../../kallisto/ERR2675460
ERR2675461 ERR2675461 normal ../../kallisto/ERR2675461
ERR2675464 ERR2675464 0-IIa ../../kallisto/ERR2675464
ERR2675465 ERR2675465 normal ../../kallisto/ERR2675465
ERR2675468 ERR2675468 0-IIa ../../kallisto/ERR2675468
ERR2675469 ERR2675469 normal ../../kallisto/ERR2675469
ERR2675472 ERR2675472 0-IIa ../../kallisto/ERR2675472
ERR2675473 ERR2675473 normal ../../kallisto/ERR2675473
ERR2675476 ERR2675476 0-IIa ../../kallisto/ERR2675476
ERR2675477 ERR2675477 normal ../../kallisto/ERR2675477
ERR2675478 ERR2675478 0-IIa ../../kallisto/ERR2675478
ERR2675479 ERR2675479 normal ../../kallisto/ERR2675479
ERR2675480 ERR2675480 0-IIa ../../kallisto/ERR2675480
ERR2675481 ERR2675481 normal ../../kallisto/ERR2675481
ERR2675484 ERR2675484 0-IIa ../../kallisto/ERR2675484
ERR2675485 ERR2675485 normal ../../kallisto/ERR2675485
Construct Sleuth object
Construct the Sleuth object so
with sleuth_prep
. This object will store the metadata, model to be used for differential testing, transcript-to-gene and bootstrap information.
so = sleuth_prep(s2c, ~ condition,
target_mapping = t2g,
extra_bootstrap_summary = TRUE, read_bootstrap_tpm = TRUE,
transformation_function = function(x) log2(x + 0.5))
reading in kallisto results
dropping unused factor levels
....................
normalizing est_counts
51524 targets passed the filter
normalizing tpm
merging in metadata
summarizing bootstraps
....................
Differential expression analysis
The likelihood ratio test is performed with a 3-step procedure. First, fit the full model by fit_name = "full"
, then performs a second fit to a “reduced” model that presumes abundances are equal in the two conditions. Finally, Sleuth will perform LRT to identify transcripts with a significantly better fit with the “full” model
so = sleuth_fit(so, ~condition, fit_name = "full")
so = sleuth_fit(so, ~1, fit_name = "reduced")
so = sleuth_lrt(so, null_model = "reduced", alt_model = "full")
The models that have been fit can be examined with the models()
function.
> models(so)
[ full ]
formula: ~condition
data modeled: obs_counts
transform sync'ed: TRUE
coefficients:
(Intercept)
condition0-IIa
[ reduced ]
formula: ~1
data modeled: obs_counts
transform sync'ed: TRUE
coefficients:
(Intercept)
Retrieve results of the test
res = sleuth_results(so, test = "reduced:full", test_type = "lrt", show_all = TRUE)
> head(res)
target_id ens_gene ext_gene pval qval test_stat rss degrees_free mean_obs var_obs tech_var sigma_sq smooth_sigma_sq final_sigma_sq
1 ENST00000256585.10 ENSG00000134193.15 REG4 7.809240e-15 2.011816e-10 60.38280 119.93926 1 9.275326 6.312593 0.0006011685 6.311992 0.1057915 6.311992
2 ENST00000369401.4 ENSG00000134193.15 REG4 4.500564e-15 2.011816e-10 61.46774 109.01207 1 5.515326 5.737477 0.0265451861 5.710932 0.2012956 5.710932
3 ENST00000240100.7 ENSG00000120875.9 DUSP4 3.638438e-14 6.248897e-10 57.35496 68.99341 1 5.429044 3.631232 0.0179548375 3.613277 0.2110772 3.613277
4 ENST00000293599.7 ENSG00000161798.7 AQP5 8.505383e-14 1.095578e-09 55.68526 233.22374 1 3.003379 12.274934 0.0830957976 12.191838 0.8045670 12.191838
5 ENST00000598581.3 ENSG00000268104.3 SLC6A14 1.214757e-13 1.251782e-09 54.98464 125.03073 1 4.395581 6.580565 0.1085976744 6.471967 0.3933200 6.471967
6 ENST00000257264.4 ENSG00000134827.8 TCN1 3.746770e-13 3.217476e-09 52.77162 169.30624 1 2.718831 8.910855 0.2021942715 8.708661 0.8876592 8.708661
Exporting results
We construct a normData
data.frame to store per-group normalised mean transcripts per million (TPM) and normalised TPM of all samples, and a deData
data.frame to store the Sleuth LRT results. We merge both data.frame with a annoData
data.frame that contain per-transcript annotation derived from GENCODE GTF file before exporting the results in tabular format
tpm_norm = as.data.frame(sleuth_to_matrix(so, "obs_norm", "tpm"))
tpm_mean = as.data.frame(sapply( levels(s2c$condition),
function(lvl) rowMeans( tpm_norm[, s2c$condition == lvl, drop = FALSE] ) ))
# log2 fold change in expression
tpm.log2fc = log2(tpm_mean[2]/tpm_mean[1])
colnames(tpm.log2fc) = "log2fc"
normData = merge(annoData, merge(tpm_mean, tpm_norm, by = "row.names"), by.x = "TranscriptID", by.y = "Row.names")
normData = normData[order(normData$Chromosome, normData$Start, normData$End),]
deData = merge(res[,c("target_id","mean_obs","pval","qval")], tpm.log2fc, by.x = "target_id", by.y = "row.names")
colnames(deData) = c("target_id","baseMean","pvalue","padj","log2fc")
deData = deData[,c("target_id","baseMean","log2fc","pvalue","padj")] # rearrange columns
deData[is.nan(deData$log2fc),]$log2fc = NA
deData[is.infinite(deData$log2fc),]$log2fc = NA
deData = merge(annoData, deData, by.x = 'TranscriptID', by.y = 'target_id')
deData = deData[order(deData$Chromosome, deData$Start, deData$End),]
write.table(normData, file="DTE_sleuth_Means_and_NormalisedTPM.Salmon.txt", sep = "\t", quote = F, row.names = F, col.names = T)
write.table(deData, file="DTE_sleuth_DE_results.Salmon.txt", sep = "\t", quote = F, row.names = F, col.names = T)
> normData[normData$GeneSymbol == "ZIC2", 1:14]
TranscriptID GeneID GeneSymbol Chromosome Start End TranscriptName Class Strand Length normal 0-IIa ERR2675454 ERR2675455
25392 ENST00000376335.8 ENSG00000043355.12 ZIC2 chr13 99981784 99986765 ZIC2-201 protein_coding + 4981 0.05172293 4.70682916 7.6582729 0
93049 ENST00000490085.5 ENSG00000043355.12 ZIC2 chr13 99984289 99985492 ZIC2-205 processed_transcript + 1203 0.00000000 0.01930897 0.1930897 0
75590 ENST00000468291.1 ENSG00000043355.12 ZIC2 chr13 99984689 99985492 ZIC2-202 processed_transcript + 803 0.08067398 0.00000000 0.0000000 0
82745 ENST00000477213.1 ENSG00000043355.12 ZIC2 chr13 99984789 99985474 ZIC2-203 processed_transcript + 685 0.01601589 0.18323929 0.0000000 0
86234 ENST00000481565.1 ENSG00000043355.12 ZIC2 chr13 99985494 99985904 ZIC2-204 processed_transcript + 410 0.00000000 0.00000000 0.0000000 0
> deData[deData$GeneSymbol == "ZIC2",]
TranscriptID GeneID GeneSymbol Chromosome Start End TranscriptName Class Strand Length baseMean log2fc pvalue padj
25392 ENST00000376335.8 ENSG00000043355.12 ZIC2 chr13 99981784 99986765 ZIC2-201 protein_coding + 4981 3.022651 6.507808 3.409937e-12 0.00000001098085
93049 ENST00000490085.5 ENSG00000043355.12 ZIC2 chr13 99984289 99985492 ZIC2-205 processed_transcript + 1203 NA NA NA NA
75590 ENST00000468291.1 ENSG00000043355.12 ZIC2 chr13 99984689 99985492 ZIC2-202 processed_transcript + 803 NA NA NA NA
82745 ENST00000477213.1 ENSG00000043355.12 ZIC2 chr13 99984789 99985474 ZIC2-203 processed_transcript + 685 NA 3.516153 NA NA
86234 ENST00000481565.1 ENSG00000043355.12 ZIC2 chr13 99985494 99985904 ZIC2-204 processed_transcript + 410 NA NA NA NA
Exploring results
Principal component plot of the samples
png("DTE_sleuth_PCA.png", width=7, height=7, units = "in", res = 300)
plot_pca(so, color_by = "condition", text_labels = TRUE, units = "tpm")
dev.off()
Plot sample heatmap using the Jensen-Shannon divergence
png("DTE_sleuth_sample_heatmap.png", width=7, height=7, units = "in", res = 300)
plot_sample_heatmap(so, use_filtered = FALSE, units = "tpm")
dev.off()
Lower divergence values represent samples that are more similar to each other
Plot clustered transcript heatmap
Create expression heatmap for top 40 transcripts with lowest q-values
png("DTE_sleuth_transcript_heatmap.png", width=7, height=7, units = "in", res = 300)
plot_transcript_heatmap(so, units = "tpm", cluster_transcripts = TRUE,
transcripts = subset(res[order(res$qval),], qval < 0.05)$target_id[1:40])
dev.off()
Plot the normalized bootstraps across all samples
Plot expression with bootstrap variation for selected transcript
png("DTE_sleuth_plot_bootstrap.png", width=7, height=7, units = "in", res = 300)
plot_bootstrap(so, target_id = "ENST00000267294.4",
units = "tpm", color_by = "condition")
dev.off()
Session info
> sessionInfo()
R version 3.6.3 (2020-02-29)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 18.04.4 LTS
Matrix products: default
BLAS/LAPACK: /home/USER/miniconda3/lib/libopenblasp-r0.3.9.so
locale:
[1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
[5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
[7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods
[7] base
other attached packages:
[1] sleuth_0.30.0 data.table_1.12.8
loaded via a namespace (and not attached):
[1] Rcpp_1.0.4.6 magrittr_1.5 tidyselect_1.1.0
[4] munsell_0.5.0 colorspace_1.4-1 R6_2.4.1
[7] rlang_0.4.6 stringr_1.4.0 plyr_1.8.6
[10] dplyr_0.8.5 tools_3.6.3 parallel_3.6.3
[13] grid_3.6.3 rhdf5_2.30.1 gtable_0.3.0
[16] ellipsis_0.3.0 digest_0.6.25 lazyeval_0.2.2
[19] assertthat_0.2.1 tibble_3.0.1 lifecycle_0.2.0
[22] crayon_1.3.4 gridExtra_2.3 RColorBrewer_1.1-2
[25] farver_2.0.3 reshape2_1.4.4 tidyr_1.0.3
[28] purrr_0.3.4 Rhdf5lib_1.8.0 ggplot2_3.3.0
[31] vctrs_0.3.0 glue_1.4.1 labeling_0.3
[34] pheatmap_1.0.12 stringi_1.4.6 compiler_3.6.3
[37] pillar_1.4.4 scales_1.1.1 pkgconfig_2.0.3
Last updated
Was this helpful?