DGE and DTE analysis of Salmon/Kallisto inputs using Sleuth

Analysis and result presented was performed with Salmon counts, Code snippet to import Kallisto counts is also provided

DGE analysis

cd /home/USER/SSAPs/analysis/dge
R

Load required libraries

library(data.table)
library(sleuth)

Import clinical data

Load clinical information and define file path

dir = "../../"
sampleData = paste0(dir, "clinical.txt")
sampleData = fread(sampleData)
rownames(sampleData) = sampleData$ENA_RUN
sampleData$individual = as.factor(sampleData$individual)
sampleData$paris_classification = as.factor(sampleData$paris_classification)

# Use relevel() to set adjacent normal samples as reference
sampleData$paris_classification = relevel(sampleData$paris_classification, ref = "normal")

Construct t2g data.frame

Load the annoData data.frame that contain per-transcript annotation derived from GENCODE GTF file and constrct a transcript-to-gene data.frame t2g in the specified format by Sleuth

annoData = "/home/USER/db/refanno/gencode.v33.annotation_transcripts.txt"
annoData = data.frame(fread(annoData))

t2g  = annoData[,c("TranscriptID","GeneID","GeneSymbol")]
colnames(t2g) = c("target_id","ens_gene","ext_gene")
> head(t2g)
          target_id          ens_gene    ext_gene
1 ENST00000456328.2 ENSG00000223972.5     DDX11L1
2 ENST00000450305.2 ENSG00000223972.5     DDX11L1
3 ENST00000488147.1 ENSG00000227232.5      WASH7P
4 ENST00000619216.1 ENSG00000278267.1   MIR6859-1
5 ENST00000473358.1 ENSG00000243485.5 MIR1302-2HG
6 ENST00000469289.1 ENSG00000243485.5 MIR1302-2HG

Construct s2c data.frame

Salmon

files = file.path(paste0(dir, "salmon-bs"), sampleData$ENA_RUN)
names(files) = sampleData$ENA_RUN
> files
                  ERR2675454                   ERR2675455 
"../../salmon-bs/ERR2675454" "../../salmon-bs/ERR2675455" 
                  ERR2675458                   ERR2675459 
"../../salmon-bs/ERR2675458" "../../salmon-bs/ERR2675459" 
                  ERR2675460                   ERR2675461 
"../../salmon-bs/ERR2675460" "../../salmon-bs/ERR2675461" 
                  ERR2675464                   ERR2675465 
"../../salmon-bs/ERR2675464" "../../salmon-bs/ERR2675465" 
                  ERR2675468                   ERR2675469 
"../../salmon-bs/ERR2675468" "../../salmon-bs/ERR2675469" 
                  ERR2675472                   ERR2675473 
"../../salmon-bs/ERR2675472" "../../salmon-bs/ERR2675473" 
                  ERR2675476                   ERR2675477 
"../../salmon-bs/ERR2675476" "../../salmon-bs/ERR2675477" 
                  ERR2675478                   ERR2675479 
"../../salmon-bs/ERR2675478" "../../salmon-bs/ERR2675479" 
                  ERR2675480                   ERR2675481 
"../../salmon-bs/ERR2675480" "../../salmon-bs/ERR2675481" 
                  ERR2675484                   ERR2675485 
"../../salmon-bs/ERR2675484" "../../salmon-bs/ERR2675485"    

Create a sample_to_covariates data.frame s2c that contains the required information in the specified format by Sleuth

s2c = data.frame(sample = sampleData$ENA_RUN, 
condition = sampleData$paris_classification, 
path = files, stringsAsFactors = FALSE)
> s2c
               sample condition                       path
ERR2675454 ERR2675454     0-IIa ../../salmon-bs/ERR2675454
ERR2675455 ERR2675455    normal ../../salmon-bs/ERR2675455
ERR2675458 ERR2675458     0-IIa ../../salmon-bs/ERR2675458
ERR2675459 ERR2675459    normal ../../salmon-bs/ERR2675459
ERR2675460 ERR2675460     0-IIa ../../salmon-bs/ERR2675460
ERR2675461 ERR2675461    normal ../../salmon-bs/ERR2675461
ERR2675464 ERR2675464     0-IIa ../../salmon-bs/ERR2675464
ERR2675465 ERR2675465    normal ../../salmon-bs/ERR2675465
ERR2675468 ERR2675468     0-IIa ../../salmon-bs/ERR2675468
ERR2675469 ERR2675469    normal ../../salmon-bs/ERR2675469
ERR2675472 ERR2675472     0-IIa ../../salmon-bs/ERR2675472
ERR2675473 ERR2675473    normal ../../salmon-bs/ERR2675473
ERR2675476 ERR2675476     0-IIa ../../salmon-bs/ERR2675476
ERR2675477 ERR2675477    normal ../../salmon-bs/ERR2675477
ERR2675478 ERR2675478     0-IIa ../../salmon-bs/ERR2675478
ERR2675479 ERR2675479    normal ../../salmon-bs/ERR2675479
ERR2675480 ERR2675480     0-IIa ../../salmon-bs/ERR2675480
ERR2675481 ERR2675481    normal ../../salmon-bs/ERR2675481
ERR2675484 ERR2675484     0-IIa ../../salmon-bs/ERR2675484
ERR2675485 ERR2675485    normal ../../salmon-bs/ERR2675485
> levels(s2c$condition)
[1] "normal" "0-IIa" 

Kallisto

files = file.path(paste0(dir, "kallisto"), sampleData$ENA_RUN)
names(files) = sampleData$ENA_RUN
> files
                 ERR2675454                  ERR2675455 
"../../kallisto/ERR2675454" "../../kallisto/ERR2675455" 
                 ERR2675458                  ERR2675459 
"../../kallisto/ERR2675458" "../../kallisto/ERR2675459" 
                 ERR2675460                  ERR2675461 
"../../kallisto/ERR2675460" "../../kallisto/ERR2675461" 
                 ERR2675464                  ERR2675465 
"../../kallisto/ERR2675464" "../../kallisto/ERR2675465" 
                 ERR2675468                  ERR2675469 
"../../kallisto/ERR2675468" "../../kallisto/ERR2675469" 
                 ERR2675472                  ERR2675473 
"../../kallisto/ERR2675472" "../../kallisto/ERR2675473" 
                 ERR2675476                  ERR2675477 
"../../kallisto/ERR2675476" "../../kallisto/ERR2675477" 
                 ERR2675478                  ERR2675479 
"../../kallisto/ERR2675478" "../../kallisto/ERR2675479" 
                 ERR2675480                  ERR2675481 
"../../kallisto/ERR2675480" "../../kallisto/ERR2675481" 
                 ERR2675484                  ERR2675485 
"../../kallisto/ERR2675484" "../../kallisto/ERR2675485" 

Create a sample_to_covariates data.frame s2c that contains the required information in the specified format by Sleuth

s2c = data.frame(sample = sampleData$ENA_RUN, 
condition = sampleData$paris_classification, 
path = files, stringsAsFactors = FALSE)
> s2c
               sample condition                      path
ERR2675454 ERR2675454     0-IIa ../../kallisto/ERR2675454
ERR2675455 ERR2675455    normal ../../kallisto/ERR2675455
ERR2675458 ERR2675458     0-IIa ../../kallisto/ERR2675458
ERR2675459 ERR2675459    normal ../../kallisto/ERR2675459
ERR2675460 ERR2675460     0-IIa ../../kallisto/ERR2675460
ERR2675461 ERR2675461    normal ../../kallisto/ERR2675461
ERR2675464 ERR2675464     0-IIa ../../kallisto/ERR2675464
ERR2675465 ERR2675465    normal ../../kallisto/ERR2675465
ERR2675468 ERR2675468     0-IIa ../../kallisto/ERR2675468
ERR2675469 ERR2675469    normal ../../kallisto/ERR2675469
ERR2675472 ERR2675472     0-IIa ../../kallisto/ERR2675472
ERR2675473 ERR2675473    normal ../../kallisto/ERR2675473
ERR2675476 ERR2675476     0-IIa ../../kallisto/ERR2675476
ERR2675477 ERR2675477    normal ../../kallisto/ERR2675477
ERR2675478 ERR2675478     0-IIa ../../kallisto/ERR2675478
ERR2675479 ERR2675479    normal ../../kallisto/ERR2675479
ERR2675480 ERR2675480     0-IIa ../../kallisto/ERR2675480
ERR2675481 ERR2675481    normal ../../kallisto/ERR2675481
ERR2675484 ERR2675484     0-IIa ../../kallisto/ERR2675484
ERR2675485 ERR2675485    normal ../../kallisto/ERR2675485

Construct Sleuth object

Construct the Sleuth object so with sleuth_prep. This object will store the metadata, model to be used for differential testing, transcript-to-gene and bootstrap information

Use aggregation_column = "ens_gene" to allow summarize the data on the gene level and gene_mode = TRUE to do counts aggregation at the gene level for normalization, transformation, and downstream steps.

so = sleuth_prep(s2c, ~ condition, target_mapping = t2g, 
aggregation_column = "ens_gene", gene_mode = TRUE,
extra_bootstrap_summary = TRUE, read_bootstrap_tpm = TRUE, 
transformation_function = function(x) log2(x + 0.5))
reading in kallisto results
dropping unused factor levels
....................
normalizing est_counts
51524 targets passed the filter
normalizing tpm
merging in metadata
aggregating by column: ens_gene
17692 genes passed the filter
summarizing bootstraps
....................

Differential expression analysis

The likelihood ratio test is performed with a 3-step procedure. First, fit the full model by fit_name = "full", then performs a second fit to a “reduced” model that presumes abundances are equal in the two conditions. Finally, Sleuth will perform LRT to identify transcripts with a significantly better fit with the “full” model

so = sleuth_fit(so, ~condition, fit_name = "full")
so = sleuth_fit(so, ~1, fit_name = "reduced")
so = sleuth_lrt(so, null_model = "reduced", alt_model = "full")

The models that have been fit can be examined with the models() function.

> models(so)
[  full  ]
formula:  ~condition 
data modeled:  obs_counts 
transform sync'ed:  TRUE 
coefficients:
	(Intercept)
 	condition0-IIa
[  reduced  ]
formula:  ~1 
data modeled:  obs_counts 
transform sync'ed:  TRUE 
coefficients:
	(Intercept)

Retrieve results of the test

res = sleuth_results(so, test = "reduced:full", test_type = "lrt", show_all = TRUE)
> head(res)
           target_id ext_gene         pval         qval test_stat       rss degrees_free mean_obs  var_obs     tech_var sigma_sq smooth_sigma_sq final_sigma_sq
1 ENSG00000134193.15     REG4 7.752172e-15 4.571714e-11  60.39724 119.68635            1 9.429405 6.299281 0.0005691108 6.298712      0.11357166       6.298712
2 ENSG00000062038.14     CDH3 7.599095e-15 4.571714e-11  60.43649 116.97036            1 2.960657 6.156335 0.1020903488 6.054245      0.28391752       6.054245
3  ENSG00000229637.4    PRAC2 4.337807e-15 4.571714e-11  61.54027  83.68869            1 1.457273 4.404668 0.0477239422 4.356944      0.45623196       4.356944
4  ENSG00000120875.9    DUSP4 2.600851e-14 1.150356e-10  58.01531  68.69969            1 5.183719 3.615773 0.0175501251 3.598223      0.11098455       3.598223
5 ENSG00000181577.16 C6orf223 4.235939e-14 1.498845e-10  57.05592 118.50929            1 3.753971 6.237331 0.1957375093 6.041594      0.19989544       6.041594
6 ENSG00000196188.11     CTSE 1.287869e-13 2.488331e-10  54.86977  81.91557            1 7.031097 4.311346 0.0025249259 4.308821      0.09906819       4.308821

Exporting results

We construct a normData data.frame to store per-group normalised mean transcripts per million (TPM) and normalised TPM of all samples, and a deData data.frame to store the Sleuth LRT results. We merge both data.frame with a annoData data.frame that contain per-transcript annotation derived from GENCODE GTF file before exporting the results in tabular format

tpm_norm = as.data.frame(sleuth_to_matrix(so, "obs_norm", "tpm"))
tpm_mean = as.data.frame(sapply( levels(s2c$condition), 
function(lvl) rowMeans( tpm_norm[, s2c$condition == lvl, drop = FALSE] ) ))

# log2 fold change in expression
tpm.log2fc = log2(tpm_mean[2]/tpm_mean[1])
colnames(tpm.log2fc) = "log2fc"

# Load gene-level annotation
annoData = "/home/USER/db/refanno/gencode.v33.annotation_genes.txt"
annoData = data.frame(fread(annoData))

normData = merge(annoData, merge(tpm_mean, tpm_norm, by = "row.names"), by.x = "GeneID", by.y = "Row.names")
normData = normData[order(normData$Chromosome, normData$Start, normData$End),]

deData = merge(res[,c("target_id","mean_obs","pval","qval")], tpm.log2fc, by.x = "target_id", by.y = "row.names")
colnames(deData) = c("target_id","baseMean","pvalue","padj","log2fc")
deData = deData[,c("target_id","baseMean","log2fc","pvalue","padj")]
deData[is.nan(deData$log2fc),]$log2fc = NA
deData[is.infinite(deData$log2fc),]$log2fc = NA
deData = merge(annoData, deData, by.x = 'GeneID', by.y = 'target_id')
deData = deData[order(deData$Chromosome, deData$Start, deData$End),]

write.table(normData, file="DGE_sleuth_Means_and_NormalisedTPM.Salmon.txt", sep = "\t", quote = F, row.names = F, col.names = T)
write.table(deData, file="DGE_sleuth_DE_results.Salmon.txt", sep = "\t", quote = F, row.names = F, col.names = T)
> normData[1:10,1:14]
                 GeneID  GeneSymbol Chromosome Start    End                              Class Strand Length     normal       0-IIa ERR2675454 ERR2675455 ERR2675458 ERR2675459
24914 ENSG00000223972.5     DDX11L1       chr1 11869  14409 transcribed_unprocessed_pseudogene      +   2540 0.02578694 0.002660157  0.0000000  0.0000000   0.000000   0.000000
27153 ENSG00000227232.5      WASH7P       chr1 14404  29570             unprocessed_pseudogene      -  15166 9.64654711 7.418128917  5.0318657  7.0457119   5.043944  11.669079
54425 ENSG00000278267.1   MIR6859-1       chr1 17369  17436                              miRNA      -     67 0.00000000 0.000000000  0.0000000  0.0000000   0.000000   0.000000
36601 ENSG00000243485.5 MIR1302-2HG       chr1 29554  31109                             lncRNA      +   1555 0.01174571 0.000000000  0.0000000  0.1174571   0.000000   0.000000
57122 ENSG00000284332.1   MIR1302-2       chr1 30366  30503                              miRNA      +    137 0.00000000 0.000000000  0.0000000  0.0000000   0.000000   0.000000
34149 ENSG00000237613.2     FAM138A       chr1 34554  36081                             lncRNA      -   1527 0.00000000 0.000000000  0.0000000  0.0000000   0.000000   0.000000
49448 ENSG00000268020.3      OR4G4P       chr1 52473  53312             unprocessed_pseudogene      +    839 0.00000000 0.000000000  0.0000000  0.0000000   0.000000   0.000000
35334 ENSG00000240361.2     OR4G11P       chr1 57598  64116 transcribed_unprocessed_pseudogene      +   6518 0.00000000 0.000000000  0.0000000  0.0000000   0.000000   0.000000
16136 ENSG00000186092.6       OR4F5       chr1 65419  71585                     protein_coding      +   6166 0.00000000 0.000000000  0.0000000  0.0000000   0.000000   0.000000
34420 ENSG00000238009.6  AL627309.1       chr1 89295 133723                             lncRNA      -  44428 0.37463259 0.356671874  0.1921258  0.5975709   0.389049   0.307692

> deData[1:10,]
                 GeneID  GeneSymbol Chromosome Start    End                              Class Strand Length baseMean      log2fc    pvalue      padj
24914 ENSG00000223972.5     DDX11L1       chr1 11869  14409 transcribed_unprocessed_pseudogene      +   2540       NA -3.27705754        NA        NA
27153 ENSG00000227232.5      WASH7P       chr1 14404  29570             unprocessed_pseudogene      -  15166 5.289557 -0.37895730 0.1989157 0.2871868
54425 ENSG00000278267.1   MIR6859-1       chr1 17369  17436                              miRNA      -     67       NA          NA        NA        NA
36601 ENSG00000243485.5 MIR1302-2HG       chr1 29554  31109                             lncRNA      +   1555       NA          NA        NA        NA
57122 ENSG00000284332.1   MIR1302-2       chr1 30366  30503                              miRNA      +    137       NA          NA        NA        NA
34149 ENSG00000237613.2     FAM138A       chr1 34554  36081                             lncRNA      -   1527       NA          NA        NA        NA
49448 ENSG00000268020.3      OR4G4P       chr1 52473  53312             unprocessed_pseudogene      +    839       NA          NA        NA        NA
35334 ENSG00000240361.2     OR4G11P       chr1 57598  64116 transcribed_unprocessed_pseudogene      +   6518       NA          NA        NA        NA
16136 ENSG00000186092.6       OR4F5       chr1 65419  71585                     protein_coding      +   6166       NA          NA        NA        NA
34420 ENSG00000238009.6  AL627309.1       chr1 89295 133723                             lncRNA      -  44428 1.263687 -0.07087897 0.2654303 0.3487556

Exploring results

Principal component plot of the samples

png("DGE_sleuth_PCA.png", width=7, height=7, units = "in", res = 300)
plot_pca(so, color_by = "condition", text_labels = TRUE, units = "tpm")
dev.off()

Plot sample heatmap using the Jensen-Shannon divergence

png("DGE_sleuth_sample_heatmap.png", width=7, height=7, units = "in", res = 300)
plot_sample_heatmap(so, use_filtered = FALSE)
dev.off()

Lower divergence values represent samples that are more similar to each other

Plot clustered transcript heatmap

Create expression heatmap for top 40 genes with lowest q-values

png("DGE_sleuth_transcript_heatmap.png", width=7, height=7, units = "in", res = 300)
plot_transcript_heatmap(so, cluster_transcripts = TRUE,
transcripts = subset(res[order(res$qval),], qval < 0.05)$target_id[1:40]) # top40
dev.off()

Plot the normalized bootstraps across all samples

Plot expression with bootstrap variation for selected gene

png("DGE_sleuth_plot_bootstrap.png", width=7, height=7, units = "in", res = 300)
plot_bootstrap(so, target_id = "ENSG00000062038.14", 
units = "tpm", color_by = "condition")
dev.off()

Session info

> sessionInfo()
R version 3.6.3 (2020-02-29)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 18.04.4 LTS

Matrix products: default
BLAS/LAPACK: /home/USER/miniconda3/lib/libopenblasp-r0.3.9.so

locale:
 [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
 [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
 [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods  
[7] base     

other attached packages:
[1] sleuth_0.30.0     data.table_1.12.8

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.4.6       RColorBrewer_1.1-2 pillar_1.4.4      
 [4] compiler_3.6.3     plyr_1.8.6         tools_3.6.3       
 [7] digest_0.6.25      lifecycle_0.2.0    tibble_3.0.1      
[10] gtable_0.3.0       rhdf5_2.30.1       pkgconfig_2.0.3   
[13] rlang_0.4.6        cli_2.0.2          parallel_3.6.3    
[16] gridExtra_2.3      dplyr_0.8.5        stringr_1.4.0     
[19] vctrs_0.3.0        grid_3.6.3         tidyselect_1.1.0  
[22] glue_1.4.1         R6_2.4.1           fansi_0.4.1       
[25] pheatmap_1.0.12    ggplot2_3.3.0      Rhdf5lib_1.8.0    
[28] purrr_0.3.4        tidyr_1.0.3        reshape2_1.4.4    
[31] farver_2.0.3       magrittr_1.5       scales_1.1.1      
[34] ellipsis_0.3.0     assertthat_0.2.1   colorspace_1.4-1  
[37] labeling_0.3       stringi_1.4.6      lazyeval_0.2.2    
[40] munsell_0.5.0      aggregation_1.0.1  crayon_1.3.4      

DTE analysis

cd /home/USER/SSAPs/analysis/dte
R

Load required libraries

library(data.table)
library(sleuth)

Import clinical data

Load clinical information and define file path

dir = "../../"
sampleData = paste0(dir, "clinical.txt")
sampleData = fread(sampleData)
rownames(sampleData) = sampleData$ENA_RUN
sampleData$individual = as.factor(sampleData$individual)
sampleData$paris_classification = as.factor(sampleData$paris_classification)

# Use relevel() to set adjacent normal samples as reference
sampleData$paris_classification = relevel(sampleData$paris_classification, ref = "normal")

Construct t2g data.frame

Load the annoData data.frame that contain per-transcript annotation derived from GENCODE GTF file and constrct a transcript-to-gene data.frame t2g in the specified format by Sleuth

annoData = "/home/USER/db/refanno/gencode.v33.annotation_transcripts.txt"
annoData = data.frame(fread(annoData))

t2g  = annoData[,c("TranscriptID","GeneID","GeneSymbol")]
colnames(t2g) = c("target_id","ens_gene","ext_gene")
> head(t2g)
          target_id          ens_gene    ext_gene
1 ENST00000456328.2 ENSG00000223972.5     DDX11L1
2 ENST00000450305.2 ENSG00000223972.5     DDX11L1
3 ENST00000488147.1 ENSG00000227232.5      WASH7P
4 ENST00000619216.1 ENSG00000278267.1   MIR6859-1
5 ENST00000473358.1 ENSG00000243485.5 MIR1302-2HG
6 ENST00000469289.1 ENSG00000243485.5 MIR1302-2HG

Construct s2c data.frame

Salmon

files = file.path(paste0(dir, "salmon-bs"), sampleData$ENA_RUN)
names(files) = sampleData$ENA_RUN
> files
                  ERR2675454                   ERR2675455 
"../../salmon-bs/ERR2675454" "../../salmon-bs/ERR2675455" 
                  ERR2675458                   ERR2675459 
"../../salmon-bs/ERR2675458" "../../salmon-bs/ERR2675459" 
                  ERR2675460                   ERR2675461 
"../../salmon-bs/ERR2675460" "../../salmon-bs/ERR2675461" 
                  ERR2675464                   ERR2675465 
"../../salmon-bs/ERR2675464" "../../salmon-bs/ERR2675465" 
                  ERR2675468                   ERR2675469 
"../../salmon-bs/ERR2675468" "../../salmon-bs/ERR2675469" 
                  ERR2675472                   ERR2675473 
"../../salmon-bs/ERR2675472" "../../salmon-bs/ERR2675473" 
                  ERR2675476                   ERR2675477 
"../../salmon-bs/ERR2675476" "../../salmon-bs/ERR2675477" 
                  ERR2675478                   ERR2675479 
"../../salmon-bs/ERR2675478" "../../salmon-bs/ERR2675479" 
                  ERR2675480                   ERR2675481 
"../../salmon-bs/ERR2675480" "../../salmon-bs/ERR2675481" 
                  ERR2675484                   ERR2675485 
"../../salmon-bs/ERR2675484" "../../salmon-bs/ERR2675485"    

Create a data.frame s2c that contains the required information in the specified format by Sleuth

s2c = data.frame(sample = sampleData$ENA_RUN, 
condition = sampleData$paris_classification, 
path = files, stringsAsFactors = FALSE)
> s2c
               sample condition                       path
ERR2675454 ERR2675454     0-IIa ../../salmon-bs/ERR2675454
ERR2675455 ERR2675455    normal ../../salmon-bs/ERR2675455
ERR2675458 ERR2675458     0-IIa ../../salmon-bs/ERR2675458
ERR2675459 ERR2675459    normal ../../salmon-bs/ERR2675459
ERR2675460 ERR2675460     0-IIa ../../salmon-bs/ERR2675460
ERR2675461 ERR2675461    normal ../../salmon-bs/ERR2675461
ERR2675464 ERR2675464     0-IIa ../../salmon-bs/ERR2675464
ERR2675465 ERR2675465    normal ../../salmon-bs/ERR2675465
ERR2675468 ERR2675468     0-IIa ../../salmon-bs/ERR2675468
ERR2675469 ERR2675469    normal ../../salmon-bs/ERR2675469
ERR2675472 ERR2675472     0-IIa ../../salmon-bs/ERR2675472
ERR2675473 ERR2675473    normal ../../salmon-bs/ERR2675473
ERR2675476 ERR2675476     0-IIa ../../salmon-bs/ERR2675476
ERR2675477 ERR2675477    normal ../../salmon-bs/ERR2675477
ERR2675478 ERR2675478     0-IIa ../../salmon-bs/ERR2675478
ERR2675479 ERR2675479    normal ../../salmon-bs/ERR2675479
ERR2675480 ERR2675480     0-IIa ../../salmon-bs/ERR2675480
ERR2675481 ERR2675481    normal ../../salmon-bs/ERR2675481
ERR2675484 ERR2675484     0-IIa ../../salmon-bs/ERR2675484
ERR2675485 ERR2675485    normal ../../salmon-bs/ERR2675485
> levels(s2c$condition)
[1] "normal" "0-IIa" 

Kallisto

files = file.path(paste0(dir, "kallisto"), sampleData$ENA_RUN)
names(files) = sampleData$ENA_RUN
> files
                 ERR2675454                  ERR2675455 
"../../kallisto/ERR2675454" "../../kallisto/ERR2675455" 
                 ERR2675458                  ERR2675459 
"../../kallisto/ERR2675458" "../../kallisto/ERR2675459" 
                 ERR2675460                  ERR2675461 
"../../kallisto/ERR2675460" "../../kallisto/ERR2675461" 
                 ERR2675464                  ERR2675465 
"../../kallisto/ERR2675464" "../../kallisto/ERR2675465" 
                 ERR2675468                  ERR2675469 
"../../kallisto/ERR2675468" "../../kallisto/ERR2675469" 
                 ERR2675472                  ERR2675473 
"../../kallisto/ERR2675472" "../../kallisto/ERR2675473" 
                 ERR2675476                  ERR2675477 
"../../kallisto/ERR2675476" "../../kallisto/ERR2675477" 
                 ERR2675478                  ERR2675479 
"../../kallisto/ERR2675478" "../../kallisto/ERR2675479" 
                 ERR2675480                  ERR2675481 
"../../kallisto/ERR2675480" "../../kallisto/ERR2675481" 
                 ERR2675484                  ERR2675485 
"../../kallisto/ERR2675484" "../../kallisto/ERR2675485" 

Create a sample_to_covariates data.frame s2c that contains the required information in the specified format by Sleuth

s2c = data.frame(sample = sampleData$ENA_RUN, 
condition = sampleData$paris_classification, 
path = files, stringsAsFactors = FALSE)
> s2c
               sample condition                      path
ERR2675454 ERR2675454     0-IIa ../../kallisto/ERR2675454
ERR2675455 ERR2675455    normal ../../kallisto/ERR2675455
ERR2675458 ERR2675458     0-IIa ../../kallisto/ERR2675458
ERR2675459 ERR2675459    normal ../../kallisto/ERR2675459
ERR2675460 ERR2675460     0-IIa ../../kallisto/ERR2675460
ERR2675461 ERR2675461    normal ../../kallisto/ERR2675461
ERR2675464 ERR2675464     0-IIa ../../kallisto/ERR2675464
ERR2675465 ERR2675465    normal ../../kallisto/ERR2675465
ERR2675468 ERR2675468     0-IIa ../../kallisto/ERR2675468
ERR2675469 ERR2675469    normal ../../kallisto/ERR2675469
ERR2675472 ERR2675472     0-IIa ../../kallisto/ERR2675472
ERR2675473 ERR2675473    normal ../../kallisto/ERR2675473
ERR2675476 ERR2675476     0-IIa ../../kallisto/ERR2675476
ERR2675477 ERR2675477    normal ../../kallisto/ERR2675477
ERR2675478 ERR2675478     0-IIa ../../kallisto/ERR2675478
ERR2675479 ERR2675479    normal ../../kallisto/ERR2675479
ERR2675480 ERR2675480     0-IIa ../../kallisto/ERR2675480
ERR2675481 ERR2675481    normal ../../kallisto/ERR2675481
ERR2675484 ERR2675484     0-IIa ../../kallisto/ERR2675484
ERR2675485 ERR2675485    normal ../../kallisto/ERR2675485

Construct Sleuth object

Construct the Sleuth object so with sleuth_prep. This object will store the metadata, model to be used for differential testing, transcript-to-gene and bootstrap information.

so = sleuth_prep(s2c, ~ condition, 
target_mapping = t2g, 
extra_bootstrap_summary = TRUE, read_bootstrap_tpm = TRUE, 
transformation_function = function(x) log2(x + 0.5))
reading in kallisto results
dropping unused factor levels
....................
normalizing est_counts
51524 targets passed the filter
normalizing tpm
merging in metadata
summarizing bootstraps
....................

Differential expression analysis

The likelihood ratio test is performed with a 3-step procedure. First, fit the full model by fit_name = "full", then performs a second fit to a “reduced” model that presumes abundances are equal in the two conditions. Finally, Sleuth will perform LRT to identify transcripts with a significantly better fit with the “full” model

so = sleuth_fit(so, ~condition, fit_name = "full")
so = sleuth_fit(so, ~1, fit_name = "reduced")
so = sleuth_lrt(so, null_model = "reduced", alt_model = "full")

The models that have been fit can be examined with the models() function.

> models(so)
[  full  ]
formula:  ~condition 
data modeled:  obs_counts 
transform sync'ed:  TRUE 
coefficients:
	(Intercept)
 	condition0-IIa
[  reduced  ]
formula:  ~1 
data modeled:  obs_counts 
transform sync'ed:  TRUE 
coefficients:
	(Intercept)

Retrieve results of the test

res = sleuth_results(so, test = "reduced:full", test_type = "lrt", show_all = TRUE)
> head(res)
           target_id           ens_gene ext_gene         pval         qval test_stat       rss degrees_free mean_obs   var_obs     tech_var  sigma_sq smooth_sigma_sq final_sigma_sq
1 ENST00000256585.10 ENSG00000134193.15     REG4 7.809240e-15 2.011816e-10  60.38280 119.93926            1 9.275326  6.312593 0.0006011685  6.311992       0.1057915       6.311992
2  ENST00000369401.4 ENSG00000134193.15     REG4 4.500564e-15 2.011816e-10  61.46774 109.01207            1 5.515326  5.737477 0.0265451861  5.710932       0.2012956       5.710932
3  ENST00000240100.7  ENSG00000120875.9    DUSP4 3.638438e-14 6.248897e-10  57.35496  68.99341            1 5.429044  3.631232 0.0179548375  3.613277       0.2110772       3.613277
4  ENST00000293599.7  ENSG00000161798.7     AQP5 8.505383e-14 1.095578e-09  55.68526 233.22374            1 3.003379 12.274934 0.0830957976 12.191838       0.8045670      12.191838
5  ENST00000598581.3  ENSG00000268104.3  SLC6A14 1.214757e-13 1.251782e-09  54.98464 125.03073            1 4.395581  6.580565 0.1085976744  6.471967       0.3933200       6.471967
6  ENST00000257264.4  ENSG00000134827.8     TCN1 3.746770e-13 3.217476e-09  52.77162 169.30624            1 2.718831  8.910855 0.2021942715  8.708661       0.8876592       8.708661

Exporting results

We construct a normData data.frame to store per-group normalised mean transcripts per million (TPM) and normalised TPM of all samples, and a deData data.frame to store the Sleuth LRT results. We merge both data.frame with a annoData data.frame that contain per-transcript annotation derived from GENCODE GTF file before exporting the results in tabular format

tpm_norm = as.data.frame(sleuth_to_matrix(so, "obs_norm", "tpm"))
tpm_mean = as.data.frame(sapply( levels(s2c$condition), 
function(lvl) rowMeans( tpm_norm[, s2c$condition == lvl, drop = FALSE] ) ))

# log2 fold change in expression
tpm.log2fc = log2(tpm_mean[2]/tpm_mean[1])
colnames(tpm.log2fc) = "log2fc"

normData = merge(annoData, merge(tpm_mean, tpm_norm, by = "row.names"), by.x = "TranscriptID", by.y = "Row.names")
normData = normData[order(normData$Chromosome, normData$Start, normData$End),]

deData = merge(res[,c("target_id","mean_obs","pval","qval")], tpm.log2fc, by.x = "target_id", by.y = "row.names")
colnames(deData) = c("target_id","baseMean","pvalue","padj","log2fc")
deData = deData[,c("target_id","baseMean","log2fc","pvalue","padj")] # rearrange columns
deData[is.nan(deData$log2fc),]$log2fc = NA
deData[is.infinite(deData$log2fc),]$log2fc = NA
deData = merge(annoData, deData, by.x = 'TranscriptID', by.y = 'target_id')
deData = deData[order(deData$Chromosome, deData$Start, deData$End),]

write.table(normData, file="DTE_sleuth_Means_and_NormalisedTPM.Salmon.txt", sep = "\t", quote = F, row.names = F, col.names = T)
write.table(deData, file="DTE_sleuth_DE_results.Salmon.txt", sep = "\t", quote = F, row.names = F, col.names = T)
> normData[normData$GeneSymbol == "ZIC2", 1:14]
           TranscriptID             GeneID GeneSymbol Chromosome    Start      End TranscriptName                Class Strand Length     normal      0-IIa ERR2675454 ERR2675455
25392 ENST00000376335.8 ENSG00000043355.12       ZIC2      chr13 99981784 99986765       ZIC2-201       protein_coding      +   4981 0.05172293 4.70682916  7.6582729          0
93049 ENST00000490085.5 ENSG00000043355.12       ZIC2      chr13 99984289 99985492       ZIC2-205 processed_transcript      +   1203 0.00000000 0.01930897  0.1930897          0
75590 ENST00000468291.1 ENSG00000043355.12       ZIC2      chr13 99984689 99985492       ZIC2-202 processed_transcript      +    803 0.08067398 0.00000000  0.0000000          0
82745 ENST00000477213.1 ENSG00000043355.12       ZIC2      chr13 99984789 99985474       ZIC2-203 processed_transcript      +    685 0.01601589 0.18323929  0.0000000          0
86234 ENST00000481565.1 ENSG00000043355.12       ZIC2      chr13 99985494 99985904       ZIC2-204 processed_transcript      +    410 0.00000000 0.00000000  0.0000000          0

> deData[deData$GeneSymbol == "ZIC2",]
           TranscriptID             GeneID GeneSymbol Chromosome    Start      End TranscriptName                Class Strand Length baseMean   log2fc       pvalue             padj
25392 ENST00000376335.8 ENSG00000043355.12       ZIC2      chr13 99981784 99986765       ZIC2-201       protein_coding      +   4981 3.022651 6.507808 3.409937e-12 0.00000001098085
93049 ENST00000490085.5 ENSG00000043355.12       ZIC2      chr13 99984289 99985492       ZIC2-205 processed_transcript      +   1203       NA       NA           NA               NA
75590 ENST00000468291.1 ENSG00000043355.12       ZIC2      chr13 99984689 99985492       ZIC2-202 processed_transcript      +    803       NA       NA           NA               NA
82745 ENST00000477213.1 ENSG00000043355.12       ZIC2      chr13 99984789 99985474       ZIC2-203 processed_transcript      +    685       NA 3.516153           NA               NA
86234 ENST00000481565.1 ENSG00000043355.12       ZIC2      chr13 99985494 99985904       ZIC2-204 processed_transcript      +    410       NA       NA           NA               NA

Exploring results

Principal component plot of the samples

png("DTE_sleuth_PCA.png", width=7, height=7, units = "in", res = 300)
plot_pca(so, color_by = "condition", text_labels = TRUE, units = "tpm")
dev.off()

Plot sample heatmap using the Jensen-Shannon divergence

png("DTE_sleuth_sample_heatmap.png", width=7, height=7, units = "in", res = 300)
plot_sample_heatmap(so, use_filtered = FALSE, units = "tpm")
dev.off()

Lower divergence values represent samples that are more similar to each other

Plot clustered transcript heatmap

Create expression heatmap for top 40 transcripts with lowest q-values

png("DTE_sleuth_transcript_heatmap.png", width=7, height=7, units = "in", res = 300)
plot_transcript_heatmap(so, units = "tpm", cluster_transcripts = TRUE,
transcripts = subset(res[order(res$qval),], qval < 0.05)$target_id[1:40])
dev.off()

Plot the normalized bootstraps across all samples

Plot expression with bootstrap variation for selected transcript

png("DTE_sleuth_plot_bootstrap.png", width=7, height=7, units = "in", res = 300)
plot_bootstrap(so, target_id = "ENST00000267294.4", 
units = "tpm", color_by = "condition")
dev.off()

Session info

> sessionInfo()
R version 3.6.3 (2020-02-29)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 18.04.4 LTS

Matrix products: default
BLAS/LAPACK: /home/USER/miniconda3/lib/libopenblasp-r0.3.9.so

locale:
 [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
 [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
 [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods  
[7] base     

other attached packages:
[1] sleuth_0.30.0     data.table_1.12.8

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.4.6       magrittr_1.5       tidyselect_1.1.0  
 [4] munsell_0.5.0      colorspace_1.4-1   R6_2.4.1          
 [7] rlang_0.4.6        stringr_1.4.0      plyr_1.8.6        
[10] dplyr_0.8.5        tools_3.6.3        parallel_3.6.3    
[13] grid_3.6.3         rhdf5_2.30.1       gtable_0.3.0      
[16] ellipsis_0.3.0     digest_0.6.25      lazyeval_0.2.2    
[19] assertthat_0.2.1   tibble_3.0.1       lifecycle_0.2.0   
[22] crayon_1.3.4       gridExtra_2.3      RColorBrewer_1.1-2
[25] farver_2.0.3       reshape2_1.4.4     tidyr_1.0.3       
[28] purrr_0.3.4        Rhdf5lib_1.8.0     ggplot2_3.3.0     
[31] vctrs_0.3.0        glue_1.4.1         labeling_0.3      
[34] pheatmap_1.0.12    stringi_1.4.6      compiler_3.6.3    
[37] pillar_1.4.4       scales_1.1.1       pkgconfig_2.0.3   

Last updated

Was this helpful?