DGE and DTE analysis of Salmon/Kallisto inputs using Sleuth
Analysis and result presented was performed with Salmon counts, Code snippet to import Kallisto counts is also provided
DGE analysis
cd /home/USER/SSAPs/analysis/dge
R
Load required libraries
library(data.table)
library(sleuth)
Import clinical data
Load clinical information and define file path
dir = "../../"
sampleData = paste0(dir, "clinical.txt")
sampleData = fread(sampleData)
rownames(sampleData) = sampleData$ENA_RUN
sampleData$individual = as.factor(sampleData$individual)
sampleData$paris_classification = as.factor(sampleData$paris_classification)
# Use relevel() to set adjacent normal samples as reference
sampleData$paris_classification = relevel(sampleData$paris_classification, ref = "normal")
Construct t2g
data.frame
t2g
data.frameLoad the annoData
data.frame that contain per-transcript annotation derived from GENCODE GTF file and constrct a transcript-to-gene data.frame t2g
in the specified format by Sleuth
annoData = "/home/USER/db/refanno/gencode.v33.annotation_transcripts.txt"
annoData = data.frame(fread(annoData))
t2g = annoData[,c("TranscriptID","GeneID","GeneSymbol")]
colnames(t2g) = c("target_id","ens_gene","ext_gene")
> head(t2g)
target_id ens_gene ext_gene
1 ENST00000456328.2 ENSG00000223972.5 DDX11L1
2 ENST00000450305.2 ENSG00000223972.5 DDX11L1
3 ENST00000488147.1 ENSG00000227232.5 WASH7P
4 ENST00000619216.1 ENSG00000278267.1 MIR6859-1
5 ENST00000473358.1 ENSG00000243485.5 MIR1302-2HG
6 ENST00000469289.1 ENSG00000243485.5 MIR1302-2HG
Construct s2c
data.frame
s2c
data.frame Salmon
files = file.path(paste0(dir, "salmon-bs"), sampleData$ENA_RUN)
names(files) = sampleData$ENA_RUN
> files
ERR2675454 ERR2675455
"../../salmon-bs/ERR2675454" "../../salmon-bs/ERR2675455"
ERR2675458 ERR2675459
"../../salmon-bs/ERR2675458" "../../salmon-bs/ERR2675459"
ERR2675460 ERR2675461
"../../salmon-bs/ERR2675460" "../../salmon-bs/ERR2675461"
ERR2675464 ERR2675465
"../../salmon-bs/ERR2675464" "../../salmon-bs/ERR2675465"
ERR2675468 ERR2675469
"../../salmon-bs/ERR2675468" "../../salmon-bs/ERR2675469"
ERR2675472 ERR2675473
"../../salmon-bs/ERR2675472" "../../salmon-bs/ERR2675473"
ERR2675476 ERR2675477
"../../salmon-bs/ERR2675476" "../../salmon-bs/ERR2675477"
ERR2675478 ERR2675479
"../../salmon-bs/ERR2675478" "../../salmon-bs/ERR2675479"
ERR2675480 ERR2675481
"../../salmon-bs/ERR2675480" "../../salmon-bs/ERR2675481"
ERR2675484 ERR2675485
"../../salmon-bs/ERR2675484" "../../salmon-bs/ERR2675485"
Create a sample_to_covariates data.frame s2c
that contains the required information in the specified format by Sleuth
s2c = data.frame(sample = sampleData$ENA_RUN,
condition = sampleData$paris_classification,
path = files, stringsAsFactors = FALSE)
> s2c
sample condition path
ERR2675454 ERR2675454 0-IIa ../../salmon-bs/ERR2675454
ERR2675455 ERR2675455 normal ../../salmon-bs/ERR2675455
ERR2675458 ERR2675458 0-IIa ../../salmon-bs/ERR2675458
ERR2675459 ERR2675459 normal ../../salmon-bs/ERR2675459
ERR2675460 ERR2675460 0-IIa ../../salmon-bs/ERR2675460
ERR2675461 ERR2675461 normal ../../salmon-bs/ERR2675461
ERR2675464 ERR2675464 0-IIa ../../salmon-bs/ERR2675464
ERR2675465 ERR2675465 normal ../../salmon-bs/ERR2675465
ERR2675468 ERR2675468 0-IIa ../../salmon-bs/ERR2675468
ERR2675469 ERR2675469 normal ../../salmon-bs/ERR2675469
ERR2675472 ERR2675472 0-IIa ../../salmon-bs/ERR2675472
ERR2675473 ERR2675473 normal ../../salmon-bs/ERR2675473
ERR2675476 ERR2675476 0-IIa ../../salmon-bs/ERR2675476
ERR2675477 ERR2675477 normal ../../salmon-bs/ERR2675477
ERR2675478 ERR2675478 0-IIa ../../salmon-bs/ERR2675478
ERR2675479 ERR2675479 normal ../../salmon-bs/ERR2675479
ERR2675480 ERR2675480 0-IIa ../../salmon-bs/ERR2675480
ERR2675481 ERR2675481 normal ../../salmon-bs/ERR2675481
ERR2675484 ERR2675484 0-IIa ../../salmon-bs/ERR2675484
ERR2675485 ERR2675485 normal ../../salmon-bs/ERR2675485
Sleuth will compare all conditions against the reference level (i.e. first level in the factor variable). Use the levels()
function to check the order of the condition and userelevel()
function to reassign reference level if necessary
> levels(s2c$condition)
[1] "normal" "0-IIa"
Kallisto
files = file.path(paste0(dir, "kallisto"), sampleData$ENA_RUN)
names(files) = sampleData$ENA_RUN
> files
ERR2675454 ERR2675455
"../../kallisto/ERR2675454" "../../kallisto/ERR2675455"
ERR2675458 ERR2675459
"../../kallisto/ERR2675458" "../../kallisto/ERR2675459"
ERR2675460 ERR2675461
"../../kallisto/ERR2675460" "../../kallisto/ERR2675461"
ERR2675464 ERR2675465
"../../kallisto/ERR2675464" "../../kallisto/ERR2675465"
ERR2675468 ERR2675469
"../../kallisto/ERR2675468" "../../kallisto/ERR2675469"
ERR2675472 ERR2675473
"../../kallisto/ERR2675472" "../../kallisto/ERR2675473"
ERR2675476 ERR2675477
"../../kallisto/ERR2675476" "../../kallisto/ERR2675477"
ERR2675478 ERR2675479
"../../kallisto/ERR2675478" "../../kallisto/ERR2675479"
ERR2675480 ERR2675481
"../../kallisto/ERR2675480" "../../kallisto/ERR2675481"
ERR2675484 ERR2675485
"../../kallisto/ERR2675484" "../../kallisto/ERR2675485"
Create a sample_to_covariates data.frame s2c
that contains the required information in the specified format by Sleuth
s2c = data.frame(sample = sampleData$ENA_RUN,
condition = sampleData$paris_classification,
path = files, stringsAsFactors = FALSE)
> s2c
sample condition path
ERR2675454 ERR2675454 0-IIa ../../kallisto/ERR2675454
ERR2675455 ERR2675455 normal ../../kallisto/ERR2675455
ERR2675458 ERR2675458 0-IIa ../../kallisto/ERR2675458
ERR2675459 ERR2675459 normal ../../kallisto/ERR2675459
ERR2675460 ERR2675460 0-IIa ../../kallisto/ERR2675460
ERR2675461 ERR2675461 normal ../../kallisto/ERR2675461
ERR2675464 ERR2675464 0-IIa ../../kallisto/ERR2675464
ERR2675465 ERR2675465 normal ../../kallisto/ERR2675465
ERR2675468 ERR2675468 0-IIa ../../kallisto/ERR2675468
ERR2675469 ERR2675469 normal ../../kallisto/ERR2675469
ERR2675472 ERR2675472 0-IIa ../../kallisto/ERR2675472
ERR2675473 ERR2675473 normal ../../kallisto/ERR2675473
ERR2675476 ERR2675476 0-IIa ../../kallisto/ERR2675476
ERR2675477 ERR2675477 normal ../../kallisto/ERR2675477
ERR2675478 ERR2675478 0-IIa ../../kallisto/ERR2675478
ERR2675479 ERR2675479 normal ../../kallisto/ERR2675479
ERR2675480 ERR2675480 0-IIa ../../kallisto/ERR2675480
ERR2675481 ERR2675481 normal ../../kallisto/ERR2675481
ERR2675484 ERR2675484 0-IIa ../../kallisto/ERR2675484
ERR2675485 ERR2675485 normal ../../kallisto/ERR2675485
Construct Sleuth object
Construct the Sleuth object so
with sleuth_prep
. This object will store the metadata, model to be used for differential testing, transcript-to-gene and bootstrap information
Use aggregation_column = "ens_gene"
to allow summarize the data on the gene level and gene_mode = TRUE
to do counts aggregation at the gene level for normalization, transformation, and downstream steps.
The default transformation of counts is natural log, i.e. log(x+0.5). We change this to log base 2 by assigning an appropriate transformation_function
so = sleuth_prep(s2c, ~ condition, target_mapping = t2g,
aggregation_column = "ens_gene", gene_mode = TRUE,
extra_bootstrap_summary = TRUE, read_bootstrap_tpm = TRUE,
transformation_function = function(x) log2(x + 0.5))
reading in kallisto results
dropping unused factor levels
....................
normalizing est_counts
51524 targets passed the filter
normalizing tpm
merging in metadata
aggregating by column: ens_gene
17692 genes passed the filter
summarizing bootstraps
....................
Differential expression analysis
The likelihood ratio test is performed with a 3-step procedure. First, fit the full model by fit_name = "full"
, then performs a second fit to a “reduced” model that presumes abundances are equal in the two conditions. Finally, Sleuth will perform LRT to identify transcripts with a significantly better fit with the “full” model
so = sleuth_fit(so, ~condition, fit_name = "full")
so = sleuth_fit(so, ~1, fit_name = "reduced")
so = sleuth_lrt(so, null_model = "reduced", alt_model = "full")
The models that have been fit can be examined with the models()
function.
> models(so)
[ full ]
formula: ~condition
data modeled: obs_counts
transform sync'ed: TRUE
coefficients:
(Intercept)
condition0-IIa
[ reduced ]
formula: ~1
data modeled: obs_counts
transform sync'ed: TRUE
coefficients:
(Intercept)
Retrieve results of the test
res = sleuth_results(so, test = "reduced:full", test_type = "lrt", show_all = TRUE)
> head(res)
target_id ext_gene pval qval test_stat rss degrees_free mean_obs var_obs tech_var sigma_sq smooth_sigma_sq final_sigma_sq
1 ENSG00000134193.15 REG4 7.752172e-15 4.571714e-11 60.39724 119.68635 1 9.429405 6.299281 0.0005691108 6.298712 0.11357166 6.298712
2 ENSG00000062038.14 CDH3 7.599095e-15 4.571714e-11 60.43649 116.97036 1 2.960657 6.156335 0.1020903488 6.054245 0.28391752 6.054245
3 ENSG00000229637.4 PRAC2 4.337807e-15 4.571714e-11 61.54027 83.68869 1 1.457273 4.404668 0.0477239422 4.356944 0.45623196 4.356944
4 ENSG00000120875.9 DUSP4 2.600851e-14 1.150356e-10 58.01531 68.69969 1 5.183719 3.615773 0.0175501251 3.598223 0.11098455 3.598223
5 ENSG00000181577.16 C6orf223 4.235939e-14 1.498845e-10 57.05592 118.50929 1 3.753971 6.237331 0.1957375093 6.041594 0.19989544 6.041594
6 ENSG00000196188.11 CTSE 1.287869e-13 2.488331e-10 54.86977 81.91557 1 7.031097 4.311346 0.0025249259 4.308821 0.09906819 4.308821
Exporting results
We construct a normData
data.frame to store per-group normalised mean transcripts per million (TPM) and normalised TPM of all samples, and a deData
data.frame to store the Sleuth LRT results. We merge both data.frame with a annoData
data.frame that contain per-transcript annotation derived from GENCODE GTF file before exporting the results in tabular format
tpm_norm = as.data.frame(sleuth_to_matrix(so, "obs_norm", "tpm"))
tpm_mean = as.data.frame(sapply( levels(s2c$condition),
function(lvl) rowMeans( tpm_norm[, s2c$condition == lvl, drop = FALSE] ) ))
# log2 fold change in expression
tpm.log2fc = log2(tpm_mean[2]/tpm_mean[1])
colnames(tpm.log2fc) = "log2fc"
# Load gene-level annotation
annoData = "/home/USER/db/refanno/gencode.v33.annotation_genes.txt"
annoData = data.frame(fread(annoData))
normData = merge(annoData, merge(tpm_mean, tpm_norm, by = "row.names"), by.x = "GeneID", by.y = "Row.names")
normData = normData[order(normData$Chromosome, normData$Start, normData$End),]
deData = merge(res[,c("target_id","mean_obs","pval","qval")], tpm.log2fc, by.x = "target_id", by.y = "row.names")
colnames(deData) = c("target_id","baseMean","pvalue","padj","log2fc")
deData = deData[,c("target_id","baseMean","log2fc","pvalue","padj")]
deData[is.nan(deData$log2fc),]$log2fc = NA
deData[is.infinite(deData$log2fc),]$log2fc = NA
deData = merge(annoData, deData, by.x = 'GeneID', by.y = 'target_id')
deData = deData[order(deData$Chromosome, deData$Start, deData$End),]
write.table(normData, file="DGE_sleuth_Means_and_NormalisedTPM.Salmon.txt", sep = "\t", quote = F, row.names = F, col.names = T)
write.table(deData, file="DGE_sleuth_DE_results.Salmon.txt", sep = "\t", quote = F, row.names = F, col.names = T)
> normData[1:10,1:14]
GeneID GeneSymbol Chromosome Start End Class Strand Length normal 0-IIa ERR2675454 ERR2675455 ERR2675458 ERR2675459
24914 ENSG00000223972.5 DDX11L1 chr1 11869 14409 transcribed_unprocessed_pseudogene + 2540 0.02578694 0.002660157 0.0000000 0.0000000 0.000000 0.000000
27153 ENSG00000227232.5 WASH7P chr1 14404 29570 unprocessed_pseudogene - 15166 9.64654711 7.418128917 5.0318657 7.0457119 5.043944 11.669079
54425 ENSG00000278267.1 MIR6859-1 chr1 17369 17436 miRNA - 67 0.00000000 0.000000000 0.0000000 0.0000000 0.000000 0.000000
36601 ENSG00000243485.5 MIR1302-2HG chr1 29554 31109 lncRNA + 1555 0.01174571 0.000000000 0.0000000 0.1174571 0.000000 0.000000
57122 ENSG00000284332.1 MIR1302-2 chr1 30366 30503 miRNA + 137 0.00000000 0.000000000 0.0000000 0.0000000 0.000000 0.000000
34149 ENSG00000237613.2 FAM138A chr1 34554 36081 lncRNA - 1527 0.00000000 0.000000000 0.0000000 0.0000000 0.000000 0.000000
49448 ENSG00000268020.3 OR4G4P chr1 52473 53312 unprocessed_pseudogene + 839 0.00000000 0.000000000 0.0000000 0.0000000 0.000000 0.000000
35334 ENSG00000240361.2 OR4G11P chr1 57598 64116 transcribed_unprocessed_pseudogene + 6518 0.00000000 0.000000000 0.0000000 0.0000000 0.000000 0.000000
16136 ENSG00000186092.6 OR4F5 chr1 65419 71585 protein_coding + 6166 0.00000000 0.000000000 0.0000000 0.0000000 0.000000 0.000000
34420 ENSG00000238009.6 AL627309.1 chr1 89295 133723 lncRNA - 44428 0.37463259 0.356671874 0.1921258 0.5975709 0.389049 0.307692
> deData[1:10,]
GeneID GeneSymbol Chromosome Start End Class Strand Length baseMean log2fc pvalue padj
24914 ENSG00000223972.5 DDX11L1 chr1 11869 14409 transcribed_unprocessed_pseudogene + 2540 NA -3.27705754 NA NA
27153 ENSG00000227232.5 WASH7P chr1 14404 29570 unprocessed_pseudogene - 15166 5.289557 -0.37895730 0.1989157 0.2871868
54425 ENSG00000278267.1 MIR6859-1 chr1 17369 17436 miRNA - 67 NA NA NA NA
36601 ENSG00000243485.5 MIR1302-2HG chr1 29554 31109 lncRNA + 1555 NA NA NA NA
57122 ENSG00000284332.1 MIR1302-2 chr1 30366 30503 miRNA + 137 NA NA NA NA
34149 ENSG00000237613.2 FAM138A chr1 34554 36081 lncRNA - 1527 NA NA NA NA
49448 ENSG00000268020.3 OR4G4P chr1 52473 53312 unprocessed_pseudogene + 839 NA NA NA NA
35334 ENSG00000240361.2 OR4G11P chr1 57598 64116 transcribed_unprocessed_pseudogene + 6518 NA NA NA NA
16136 ENSG00000186092.6 OR4F5 chr1 65419 71585 protein_coding + 6166 NA NA NA NA
34420 ENSG00000238009.6 AL627309.1 chr1 89295 133723 lncRNA - 44428 1.263687 -0.07087897 0.2654303 0.3487556
Exploring results
Principal component plot of the samples
png("DGE_sleuth_PCA.png", width=7, height=7, units = "in", res = 300)
plot_pca(so, color_by = "condition", text_labels = TRUE, units = "tpm")
dev.off()
Plot sample heatmap using the Jensen-Shannon divergence
png("DGE_sleuth_sample_heatmap.png", width=7, height=7, units = "in", res = 300)
plot_sample_heatmap(so, use_filtered = FALSE)
dev.off()
Lower divergence values represent samples that are more similar to each other
Plot clustered transcript heatmap
Create expression heatmap for top 40 genes with lowest q-values
png("DGE_sleuth_transcript_heatmap.png", width=7, height=7, units = "in", res = 300)
plot_transcript_heatmap(so, cluster_transcripts = TRUE,
transcripts = subset(res[order(res$qval),], qval < 0.05)$target_id[1:40]) # top40
dev.off()
Plot the normalized bootstraps across all samples
Plot expression with bootstrap variation for selected gene
png("DGE_sleuth_plot_bootstrap.png", width=7, height=7, units = "in", res = 300)
plot_bootstrap(so, target_id = "ENSG00000062038.14",
units = "tpm", color_by = "condition")
dev.off()
Session info
> sessionInfo()
R version 3.6.3 (2020-02-29)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 18.04.4 LTS
Matrix products: default
BLAS/LAPACK: /home/USER/miniconda3/lib/libopenblasp-r0.3.9.so
locale:
[1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
[5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
[7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods
[7] base
other attached packages:
[1] sleuth_0.30.0 data.table_1.12.8
loaded via a namespace (and not attached):
[1] Rcpp_1.0.4.6 RColorBrewer_1.1-2 pillar_1.4.4
[4] compiler_3.6.3 plyr_1.8.6 tools_3.6.3
[7] digest_0.6.25 lifecycle_0.2.0 tibble_3.0.1
[10] gtable_0.3.0 rhdf5_2.30.1 pkgconfig_2.0.3
[13] rlang_0.4.6 cli_2.0.2 parallel_3.6.3
[16] gridExtra_2.3 dplyr_0.8.5 stringr_1.4.0
[19] vctrs_0.3.0 grid_3.6.3 tidyselect_1.1.0
[22] glue_1.4.1 R6_2.4.1 fansi_0.4.1
[25] pheatmap_1.0.12 ggplot2_3.3.0 Rhdf5lib_1.8.0
[28] purrr_0.3.4 tidyr_1.0.3 reshape2_1.4.4
[31] farver_2.0.3 magrittr_1.5 scales_1.1.1
[34] ellipsis_0.3.0 assertthat_0.2.1 colorspace_1.4-1
[37] labeling_0.3 stringi_1.4.6 lazyeval_0.2.2
[40] munsell_0.5.0 aggregation_1.0.1 crayon_1.3.4
DTE analysis
cd /home/USER/SSAPs/analysis/dte
R
Load required libraries
library(data.table)
library(sleuth)
Import clinical data
Load clinical information and define file path
dir = "../../"
sampleData = paste0(dir, "clinical.txt")
sampleData = fread(sampleData)
rownames(sampleData) = sampleData$ENA_RUN
sampleData$individual = as.factor(sampleData$individual)
sampleData$paris_classification = as.factor(sampleData$paris_classification)
# Use relevel() to set adjacent normal samples as reference
sampleData$paris_classification = relevel(sampleData$paris_classification, ref = "normal")
Construct t2g
data.frame
t2g
data.frameLoad the annoData
data.frame that contain per-transcript annotation derived from GENCODE GTF file and constrct a transcript-to-gene data.frame t2g
in the specified format by Sleuth
annoData = "/home/USER/db/refanno/gencode.v33.annotation_transcripts.txt"
annoData = data.frame(fread(annoData))
t2g = annoData[,c("TranscriptID","GeneID","GeneSymbol")]
colnames(t2g) = c("target_id","ens_gene","ext_gene")
> head(t2g)
target_id ens_gene ext_gene
1 ENST00000456328.2 ENSG00000223972.5 DDX11L1
2 ENST00000450305.2 ENSG00000223972.5 DDX11L1
3 ENST00000488147.1 ENSG00000227232.5 WASH7P
4 ENST00000619216.1 ENSG00000278267.1 MIR6859-1
5 ENST00000473358.1 ENSG00000243485.5 MIR1302-2HG
6 ENST00000469289.1 ENSG00000243485.5 MIR1302-2HG
Construct s2c
data.frame
s2c
data.frame Salmon
files = file.path(paste0(dir, "salmon-bs"), sampleData$ENA_RUN)
names(files) = sampleData$ENA_RUN
> files
ERR2675454 ERR2675455
"../../salmon-bs/ERR2675454" "../../salmon-bs/ERR2675455"
ERR2675458 ERR2675459
"../../salmon-bs/ERR2675458" "../../salmon-bs/ERR2675459"
ERR2675460 ERR2675461
"../../salmon-bs/ERR2675460" "../../salmon-bs/ERR2675461"
ERR2675464 ERR2675465
"../../salmon-bs/ERR2675464" "../../salmon-bs/ERR2675465"
ERR2675468 ERR2675469
"../../salmon-bs/ERR2675468" "../../salmon-bs/ERR2675469"
ERR2675472 ERR2675473
"../../salmon-bs/ERR2675472" "../../salmon-bs/ERR2675473"
ERR2675476 ERR2675477
"../../salmon-bs/ERR2675476" "../../salmon-bs/ERR2675477"
ERR2675478 ERR2675479
"../../salmon-bs/ERR2675478" "../../salmon-bs/ERR2675479"
ERR2675480 ERR2675481
"../../salmon-bs/ERR2675480" "../../salmon-bs/ERR2675481"
ERR2675484 ERR2675485
"../../salmon-bs/ERR2675484" "../../salmon-bs/ERR2675485"
Create a data.frame s2c
that contains the required information in the specified format by Sleuth
s2c = data.frame(sample = sampleData$ENA_RUN,
condition = sampleData$paris_classification,
path = files, stringsAsFactors = FALSE)
> s2c
sample condition path
ERR2675454 ERR2675454 0-IIa ../../salmon-bs/ERR2675454
ERR2675455 ERR2675455 normal ../../salmon-bs/ERR2675455
ERR2675458 ERR2675458 0-IIa ../../salmon-bs/ERR2675458
ERR2675459 ERR2675459 normal ../../salmon-bs/ERR2675459
ERR2675460 ERR2675460 0-IIa ../../salmon-bs/ERR2675460
ERR2675461 ERR2675461 normal ../../salmon-bs/ERR2675461
ERR2675464 ERR2675464 0-IIa ../../salmon-bs/ERR2675464
ERR2675465 ERR2675465 normal ../../salmon-bs/ERR2675465
ERR2675468 ERR2675468 0-IIa ../../salmon-bs/ERR2675468
ERR2675469 ERR2675469 normal ../../salmon-bs/ERR2675469
ERR2675472 ERR2675472 0-IIa ../../salmon-bs/ERR2675472
ERR2675473 ERR2675473 normal ../../salmon-bs/ERR2675473
ERR2675476 ERR2675476 0-IIa ../../salmon-bs/ERR2675476
ERR2675477 ERR2675477 normal ../../salmon-bs/ERR2675477
ERR2675478 ERR2675478 0-IIa ../../salmon-bs/ERR2675478
ERR2675479 ERR2675479 normal ../../salmon-bs/ERR2675479
ERR2675480 ERR2675480 0-IIa ../../salmon-bs/ERR2675480
ERR2675481 ERR2675481 normal ../../salmon-bs/ERR2675481
ERR2675484 ERR2675484 0-IIa ../../salmon-bs/ERR2675484
ERR2675485 ERR2675485 normal ../../salmon-bs/ERR2675485
Sleuth will compare all conditions against the reference level (i.e. first level in the factor variable). Use the levels()
function to check the order of the condition and userelevel()
function to reassign reference level if necessary
> levels(s2c$condition)
[1] "normal" "0-IIa"
Kallisto
files = file.path(paste0(dir, "kallisto"), sampleData$ENA_RUN)
names(files) = sampleData$ENA_RUN
> files
ERR2675454 ERR2675455
"../../kallisto/ERR2675454" "../../kallisto/ERR2675455"
ERR2675458 ERR2675459
"../../kallisto/ERR2675458" "../../kallisto/ERR2675459"
ERR2675460 ERR2675461
"../../kallisto/ERR2675460" "../../kallisto/ERR2675461"
ERR2675464 ERR2675465
"../../kallisto/ERR2675464" "../../kallisto/ERR2675465"
ERR2675468 ERR2675469
"../../kallisto/ERR2675468" "../../kallisto/ERR2675469"
ERR2675472 ERR2675473
"../../kallisto/ERR2675472" "../../kallisto/ERR2675473"
ERR2675476 ERR2675477
"../../kallisto/ERR2675476" "../../kallisto/ERR2675477"
ERR2675478 ERR2675479
"../../kallisto/ERR2675478" "../../kallisto/ERR2675479"
ERR2675480 ERR2675481
"../../kallisto/ERR2675480" "../../kallisto/ERR2675481"
ERR2675484 ERR2675485
"../../kallisto/ERR2675484" "../../kallisto/ERR2675485"
Create a sample_to_covariates data.frame s2c
that contains the required information in the specified format by Sleuth
s2c = data.frame(sample = sampleData$ENA_RUN,
condition = sampleData$paris_classification,
path = files, stringsAsFactors = FALSE)
> s2c
sample condition path
ERR2675454 ERR2675454 0-IIa ../../kallisto/ERR2675454
ERR2675455 ERR2675455 normal ../../kallisto/ERR2675455
ERR2675458 ERR2675458 0-IIa ../../kallisto/ERR2675458
ERR2675459 ERR2675459 normal ../../kallisto/ERR2675459
ERR2675460 ERR2675460 0-IIa ../../kallisto/ERR2675460
ERR2675461 ERR2675461 normal ../../kallisto/ERR2675461
ERR2675464 ERR2675464 0-IIa ../../kallisto/ERR2675464
ERR2675465 ERR2675465 normal ../../kallisto/ERR2675465
ERR2675468 ERR2675468 0-IIa ../../kallisto/ERR2675468
ERR2675469 ERR2675469 normal ../../kallisto/ERR2675469
ERR2675472 ERR2675472 0-IIa ../../kallisto/ERR2675472
ERR2675473 ERR2675473 normal ../../kallisto/ERR2675473
ERR2675476 ERR2675476 0-IIa ../../kallisto/ERR2675476
ERR2675477 ERR2675477 normal ../../kallisto/ERR2675477
ERR2675478 ERR2675478 0-IIa ../../kallisto/ERR2675478
ERR2675479 ERR2675479 normal ../../kallisto/ERR2675479
ERR2675480 ERR2675480 0-IIa ../../kallisto/ERR2675480
ERR2675481 ERR2675481 normal ../../kallisto/ERR2675481
ERR2675484 ERR2675484 0-IIa ../../kallisto/ERR2675484
ERR2675485 ERR2675485 normal ../../kallisto/ERR2675485
Construct Sleuth object
Construct the Sleuth object so
with sleuth_prep
. This object will store the metadata, model to be used for differential testing, transcript-to-gene and bootstrap information.
The default transformation of counts is natural log, i.e. log(x+0.5). We change this to log base 2 by assigning an appropriate transformation_function
so = sleuth_prep(s2c, ~ condition,
target_mapping = t2g,
extra_bootstrap_summary = TRUE, read_bootstrap_tpm = TRUE,
transformation_function = function(x) log2(x + 0.5))
reading in kallisto results
dropping unused factor levels
....................
normalizing est_counts
51524 targets passed the filter
normalizing tpm
merging in metadata
summarizing bootstraps
....................
Differential expression analysis
The likelihood ratio test is performed with a 3-step procedure. First, fit the full model by fit_name = "full"
, then performs a second fit to a “reduced” model that presumes abundances are equal in the two conditions. Finally, Sleuth will perform LRT to identify transcripts with a significantly better fit with the “full” model
so = sleuth_fit(so, ~condition, fit_name = "full")
so = sleuth_fit(so, ~1, fit_name = "reduced")
so = sleuth_lrt(so, null_model = "reduced", alt_model = "full")
The models that have been fit can be examined with the models()
function.
> models(so)
[ full ]
formula: ~condition
data modeled: obs_counts
transform sync'ed: TRUE
coefficients:
(Intercept)
condition0-IIa
[ reduced ]
formula: ~1
data modeled: obs_counts
transform sync'ed: TRUE
coefficients:
(Intercept)
Retrieve results of the test
res = sleuth_results(so, test = "reduced:full", test_type = "lrt", show_all = TRUE)
> head(res)
target_id ens_gene ext_gene pval qval test_stat rss degrees_free mean_obs var_obs tech_var sigma_sq smooth_sigma_sq final_sigma_sq
1 ENST00000256585.10 ENSG00000134193.15 REG4 7.809240e-15 2.011816e-10 60.38280 119.93926 1 9.275326 6.312593 0.0006011685 6.311992 0.1057915 6.311992
2 ENST00000369401.4 ENSG00000134193.15 REG4 4.500564e-15 2.011816e-10 61.46774 109.01207 1 5.515326 5.737477 0.0265451861 5.710932 0.2012956 5.710932
3 ENST00000240100.7 ENSG00000120875.9 DUSP4 3.638438e-14 6.248897e-10 57.35496 68.99341 1 5.429044 3.631232 0.0179548375 3.613277 0.2110772 3.613277
4 ENST00000293599.7 ENSG00000161798.7 AQP5 8.505383e-14 1.095578e-09 55.68526 233.22374 1 3.003379 12.274934 0.0830957976 12.191838 0.8045670 12.191838
5 ENST00000598581.3 ENSG00000268104.3 SLC6A14 1.214757e-13 1.251782e-09 54.98464 125.03073 1 4.395581 6.580565 0.1085976744 6.471967 0.3933200 6.471967
6 ENST00000257264.4 ENSG00000134827.8 TCN1 3.746770e-13 3.217476e-09 52.77162 169.30624 1 2.718831 8.910855 0.2021942715 8.708661 0.8876592 8.708661
Exporting results
We construct a normData
data.frame to store per-group normalised mean transcripts per million (TPM) and normalised TPM of all samples, and a deData
data.frame to store the Sleuth LRT results. We merge both data.frame with a annoData
data.frame that contain per-transcript annotation derived from GENCODE GTF file before exporting the results in tabular format
tpm_norm = as.data.frame(sleuth_to_matrix(so, "obs_norm", "tpm"))
tpm_mean = as.data.frame(sapply( levels(s2c$condition),
function(lvl) rowMeans( tpm_norm[, s2c$condition == lvl, drop = FALSE] ) ))
# log2 fold change in expression
tpm.log2fc = log2(tpm_mean[2]/tpm_mean[1])
colnames(tpm.log2fc) = "log2fc"
normData = merge(annoData, merge(tpm_mean, tpm_norm, by = "row.names"), by.x = "TranscriptID", by.y = "Row.names")
normData = normData[order(normData$Chromosome, normData$Start, normData$End),]
deData = merge(res[,c("target_id","mean_obs","pval","qval")], tpm.log2fc, by.x = "target_id", by.y = "row.names")
colnames(deData) = c("target_id","baseMean","pvalue","padj","log2fc")
deData = deData[,c("target_id","baseMean","log2fc","pvalue","padj")] # rearrange columns
deData[is.nan(deData$log2fc),]$log2fc = NA
deData[is.infinite(deData$log2fc),]$log2fc = NA
deData = merge(annoData, deData, by.x = 'TranscriptID', by.y = 'target_id')
deData = deData[order(deData$Chromosome, deData$Start, deData$End),]
write.table(normData, file="DTE_sleuth_Means_and_NormalisedTPM.Salmon.txt", sep = "\t", quote = F, row.names = F, col.names = T)
write.table(deData, file="DTE_sleuth_DE_results.Salmon.txt", sep = "\t", quote = F, row.names = F, col.names = T)
> normData[normData$GeneSymbol == "ZIC2", 1:14]
TranscriptID GeneID GeneSymbol Chromosome Start End TranscriptName Class Strand Length normal 0-IIa ERR2675454 ERR2675455
25392 ENST00000376335.8 ENSG00000043355.12 ZIC2 chr13 99981784 99986765 ZIC2-201 protein_coding + 4981 0.05172293 4.70682916 7.6582729 0
93049 ENST00000490085.5 ENSG00000043355.12 ZIC2 chr13 99984289 99985492 ZIC2-205 processed_transcript + 1203 0.00000000 0.01930897 0.1930897 0
75590 ENST00000468291.1 ENSG00000043355.12 ZIC2 chr13 99984689 99985492 ZIC2-202 processed_transcript + 803 0.08067398 0.00000000 0.0000000 0
82745 ENST00000477213.1 ENSG00000043355.12 ZIC2 chr13 99984789 99985474 ZIC2-203 processed_transcript + 685 0.01601589 0.18323929 0.0000000 0
86234 ENST00000481565.1 ENSG00000043355.12 ZIC2 chr13 99985494 99985904 ZIC2-204 processed_transcript + 410 0.00000000 0.00000000 0.0000000 0
> deData[deData$GeneSymbol == "ZIC2",]
TranscriptID GeneID GeneSymbol Chromosome Start End TranscriptName Class Strand Length baseMean log2fc pvalue padj
25392 ENST00000376335.8 ENSG00000043355.12 ZIC2 chr13 99981784 99986765 ZIC2-201 protein_coding + 4981 3.022651 6.507808 3.409937e-12 0.00000001098085
93049 ENST00000490085.5 ENSG00000043355.12 ZIC2 chr13 99984289 99985492 ZIC2-205 processed_transcript + 1203 NA NA NA NA
75590 ENST00000468291.1 ENSG00000043355.12 ZIC2 chr13 99984689 99985492 ZIC2-202 processed_transcript + 803 NA NA NA NA
82745 ENST00000477213.1 ENSG00000043355.12 ZIC2 chr13 99984789 99985474 ZIC2-203 processed_transcript + 685 NA 3.516153 NA NA
86234 ENST00000481565.1 ENSG00000043355.12 ZIC2 chr13 99985494 99985904 ZIC2-204 processed_transcript + 410 NA NA NA NA
Exploring results
Principal component plot of the samples
png("DTE_sleuth_PCA.png", width=7, height=7, units = "in", res = 300)
plot_pca(so, color_by = "condition", text_labels = TRUE, units = "tpm")
dev.off()
Plot sample heatmap using the Jensen-Shannon divergence
png("DTE_sleuth_sample_heatmap.png", width=7, height=7, units = "in", res = 300)
plot_sample_heatmap(so, use_filtered = FALSE, units = "tpm")
dev.off()
Lower divergence values represent samples that are more similar to each other
Plot clustered transcript heatmap
Create expression heatmap for top 40 transcripts with lowest q-values
png("DTE_sleuth_transcript_heatmap.png", width=7, height=7, units = "in", res = 300)
plot_transcript_heatmap(so, units = "tpm", cluster_transcripts = TRUE,
transcripts = subset(res[order(res$qval),], qval < 0.05)$target_id[1:40])
dev.off()
Plot the normalized bootstraps across all samples
Plot expression with bootstrap variation for selected transcript
png("DTE_sleuth_plot_bootstrap.png", width=7, height=7, units = "in", res = 300)
plot_bootstrap(so, target_id = "ENST00000267294.4",
units = "tpm", color_by = "condition")
dev.off()
Session info
> sessionInfo()
R version 3.6.3 (2020-02-29)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 18.04.4 LTS
Matrix products: default
BLAS/LAPACK: /home/USER/miniconda3/lib/libopenblasp-r0.3.9.so
locale:
[1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
[5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
[7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods
[7] base
other attached packages:
[1] sleuth_0.30.0 data.table_1.12.8
loaded via a namespace (and not attached):
[1] Rcpp_1.0.4.6 magrittr_1.5 tidyselect_1.1.0
[4] munsell_0.5.0 colorspace_1.4-1 R6_2.4.1
[7] rlang_0.4.6 stringr_1.4.0 plyr_1.8.6
[10] dplyr_0.8.5 tools_3.6.3 parallel_3.6.3
[13] grid_3.6.3 rhdf5_2.30.1 gtable_0.3.0
[16] ellipsis_0.3.0 digest_0.6.25 lazyeval_0.2.2
[19] assertthat_0.2.1 tibble_3.0.1 lifecycle_0.2.0
[22] crayon_1.3.4 gridExtra_2.3 RColorBrewer_1.1-2
[25] farver_2.0.3 reshape2_1.4.4 tidyr_1.0.3
[28] purrr_0.3.4 Rhdf5lib_1.8.0 ggplot2_3.3.0
[31] vctrs_0.3.0 glue_1.4.1 labeling_0.3
[34] pheatmap_1.0.12 stringi_1.4.6 compiler_3.6.3
[37] pillar_1.4.4 scales_1.1.1 pkgconfig_2.0.3
Last updated
Was this helpful?