DGE and DTE analysis of Salmon/Kallisto inputs using Sleuth

Analysis and result presented was performed with Salmon counts, Code snippet to import Kallisto counts is also provided

DGE analysis

cd /home/USER/SSAPs/analysis/dge
R

Load required libraries

library(data.table)
library(sleuth)

Import clinical data

Load clinical information and define file path

dir = "../../"
sampleData = paste0(dir, "clinical.txt")
sampleData = fread(sampleData)
rownames(sampleData) = sampleData$ENA_RUN
sampleData$individual = as.factor(sampleData$individual)
sampleData$paris_classification = as.factor(sampleData$paris_classification)

# Use relevel() to set adjacent normal samples as reference
sampleData$paris_classification = relevel(sampleData$paris_classification, ref = "normal")

Construct t2g data.frame

Load the annoData data.frame that contain per-transcript annotation derived from GENCODE GTF file and constrct a transcript-to-gene data.frame t2g in the specified format by Sleuth

annoData = "/home/USER/db/refanno/gencode.v33.annotation_transcripts.txt"
annoData = data.frame(fread(annoData))

t2g  = annoData[,c("TranscriptID","GeneID","GeneSymbol")]
colnames(t2g) = c("target_id","ens_gene","ext_gene")
> head(t2g)
          target_id          ens_gene    ext_gene
1 ENST00000456328.2 ENSG00000223972.5     DDX11L1
2 ENST00000450305.2 ENSG00000223972.5     DDX11L1
3 ENST00000488147.1 ENSG00000227232.5      WASH7P
4 ENST00000619216.1 ENSG00000278267.1   MIR6859-1
5 ENST00000473358.1 ENSG00000243485.5 MIR1302-2HG
6 ENST00000469289.1 ENSG00000243485.5 MIR1302-2HG

Construct s2c data.frame

Salmon

files = file.path(paste0(dir, "salmon-bs"), sampleData$ENA_RUN)
names(files) = sampleData$ENA_RUN
> files
                  ERR2675454                   ERR2675455 
"../../salmon-bs/ERR2675454" "../../salmon-bs/ERR2675455" 
                  ERR2675458                   ERR2675459 
"../../salmon-bs/ERR2675458" "../../salmon-bs/ERR2675459" 
                  ERR2675460                   ERR2675461 
"../../salmon-bs/ERR2675460" "../../salmon-bs/ERR2675461" 
                  ERR2675464                   ERR2675465 
"../../salmon-bs/ERR2675464" "../../salmon-bs/ERR2675465" 
                  ERR2675468                   ERR2675469 
"../../salmon-bs/ERR2675468" "../../salmon-bs/ERR2675469" 
                  ERR2675472                   ERR2675473 
"../../salmon-bs/ERR2675472" "../../salmon-bs/ERR2675473" 
                  ERR2675476                   ERR2675477 
"../../salmon-bs/ERR2675476" "../../salmon-bs/ERR2675477" 
                  ERR2675478                   ERR2675479 
"../../salmon-bs/ERR2675478" "../../salmon-bs/ERR2675479" 
                  ERR2675480                   ERR2675481 
"../../salmon-bs/ERR2675480" "../../salmon-bs/ERR2675481" 
                  ERR2675484                   ERR2675485 
"../../salmon-bs/ERR2675484" "../../salmon-bs/ERR2675485"    

Create a sample_to_covariates data.frame s2c that contains the required information in the specified format by Sleuth

s2c = data.frame(sample = sampleData$ENA_RUN, 
condition = sampleData$paris_classification, 
path = files, stringsAsFactors = FALSE)
> s2c
               sample condition                       path
ERR2675454 ERR2675454     0-IIa ../../salmon-bs/ERR2675454
ERR2675455 ERR2675455    normal ../../salmon-bs/ERR2675455
ERR2675458 ERR2675458     0-IIa ../../salmon-bs/ERR2675458
ERR2675459 ERR2675459    normal ../../salmon-bs/ERR2675459
ERR2675460 ERR2675460     0-IIa ../../salmon-bs/ERR2675460
ERR2675461 ERR2675461    normal ../../salmon-bs/ERR2675461
ERR2675464 ERR2675464     0-IIa ../../salmon-bs/ERR2675464
ERR2675465 ERR2675465    normal ../../salmon-bs/ERR2675465
ERR2675468 ERR2675468     0-IIa ../../salmon-bs/ERR2675468
ERR2675469 ERR2675469    normal ../../salmon-bs/ERR2675469
ERR2675472 ERR2675472     0-IIa ../../salmon-bs/ERR2675472
ERR2675473 ERR2675473    normal ../../salmon-bs/ERR2675473
ERR2675476 ERR2675476     0-IIa ../../salmon-bs/ERR2675476
ERR2675477 ERR2675477    normal ../../salmon-bs/ERR2675477
ERR2675478 ERR2675478     0-IIa ../../salmon-bs/ERR2675478
ERR2675479 ERR2675479    normal ../../salmon-bs/ERR2675479
ERR2675480 ERR2675480     0-IIa ../../salmon-bs/ERR2675480
ERR2675481 ERR2675481    normal ../../salmon-bs/ERR2675481
ERR2675484 ERR2675484     0-IIa ../../salmon-bs/ERR2675484
ERR2675485 ERR2675485    normal ../../salmon-bs/ERR2675485

Sleuth will compare all conditions against the reference level (i.e. first level in the factor variable). Use the levels() function to check the order of the condition and userelevel() function to reassign reference level if necessary

> levels(s2c$condition)
[1] "normal" "0-IIa" 

Kallisto

files = file.path(paste0(dir, "kallisto"), sampleData$ENA_RUN)
names(files) = sampleData$ENA_RUN
> files
                 ERR2675454                  ERR2675455 
"../../kallisto/ERR2675454" "../../kallisto/ERR2675455" 
                 ERR2675458                  ERR2675459 
"../../kallisto/ERR2675458" "../../kallisto/ERR2675459" 
                 ERR2675460                  ERR2675461 
"../../kallisto/ERR2675460" "../../kallisto/ERR2675461" 
                 ERR2675464                  ERR2675465 
"../../kallisto/ERR2675464" "../../kallisto/ERR2675465" 
                 ERR2675468                  ERR2675469 
"../../kallisto/ERR2675468" "../../kallisto/ERR2675469" 
                 ERR2675472                  ERR2675473 
"../../kallisto/ERR2675472" "../../kallisto/ERR2675473" 
                 ERR2675476                  ERR2675477 
"../../kallisto/ERR2675476" "../../kallisto/ERR2675477" 
                 ERR2675478                  ERR2675479 
"../../kallisto/ERR2675478" "../../kallisto/ERR2675479" 
                 ERR2675480                  ERR2675481 
"../../kallisto/ERR2675480" "../../kallisto/ERR2675481" 
                 ERR2675484                  ERR2675485 
"../../kallisto/ERR2675484" "../../kallisto/ERR2675485" 

Create a sample_to_covariates data.frame s2c that contains the required information in the specified format by Sleuth

s2c = data.frame(sample = sampleData$ENA_RUN, 
condition = sampleData$paris_classification, 
path = files, stringsAsFactors = FALSE)
> s2c
               sample condition                      path
ERR2675454 ERR2675454     0-IIa ../../kallisto/ERR2675454
ERR2675455 ERR2675455    normal ../../kallisto/ERR2675455
ERR2675458 ERR2675458     0-IIa ../../kallisto/ERR2675458
ERR2675459 ERR2675459    normal ../../kallisto/ERR2675459
ERR2675460 ERR2675460     0-IIa ../../kallisto/ERR2675460
ERR2675461 ERR2675461    normal ../../kallisto/ERR2675461
ERR2675464 ERR2675464     0-IIa ../../kallisto/ERR2675464
ERR2675465 ERR2675465    normal ../../kallisto/ERR2675465
ERR2675468 ERR2675468     0-IIa ../../kallisto/ERR2675468
ERR2675469 ERR2675469    normal ../../kallisto/ERR2675469
ERR2675472 ERR2675472     0-IIa ../../kallisto/ERR2675472
ERR2675473 ERR2675473    normal ../../kallisto/ERR2675473
ERR2675476 ERR2675476     0-IIa ../../kallisto/ERR2675476
ERR2675477 ERR2675477    normal ../../kallisto/ERR2675477
ERR2675478 ERR2675478     0-IIa ../../kallisto/ERR2675478
ERR2675479 ERR2675479    normal ../../kallisto/ERR2675479
ERR2675480 ERR2675480     0-IIa ../../kallisto/ERR2675480
ERR2675481 ERR2675481    normal ../../kallisto/ERR2675481
ERR2675484 ERR2675484     0-IIa ../../kallisto/ERR2675484
ERR2675485 ERR2675485    normal ../../kallisto/ERR2675485

Construct Sleuth object

Construct the Sleuth object so with sleuth_prep. This object will store the metadata, model to be used for differential testing, transcript-to-gene and bootstrap information

Use aggregation_column = "ens_gene" to allow summarize the data on the gene level and gene_mode = TRUE to do counts aggregation at the gene level for normalization, transformation, and downstream steps.

The default transformation of counts is natural log, i.e. log(x+0.5). We change this to log base 2 by assigning an appropriate transformation_function

so = sleuth_prep(s2c, ~ condition, target_mapping = t2g, 
aggregation_column = "ens_gene", gene_mode = TRUE,
extra_bootstrap_summary = TRUE, read_bootstrap_tpm = TRUE, 
transformation_function = function(x) log2(x + 0.5))
reading in kallisto results
dropping unused factor levels
....................
normalizing est_counts
51524 targets passed the filter
normalizing tpm
merging in metadata
aggregating by column: ens_gene
17692 genes passed the filter
summarizing bootstraps
....................

Differential expression analysis

The likelihood ratio test is performed with a 3-step procedure. First, fit the full model by fit_name = "full", then performs a second fit to a “reduced” model that presumes abundances are equal in the two conditions. Finally, Sleuth will perform LRT to identify transcripts with a significantly better fit with the “full” model

so = sleuth_fit(so, ~condition, fit_name = "full")
so = sleuth_fit(so, ~1, fit_name = "reduced")
so = sleuth_lrt(so, null_model = "reduced", alt_model = "full")

The models that have been fit can be examined with the models() function.

> models(so)
[  full  ]
formula:  ~condition 
data modeled:  obs_counts 
transform sync'ed:  TRUE 
coefficients:
	(Intercept)
 	condition0-IIa
[  reduced  ]
formula:  ~1 
data modeled:  obs_counts 
transform sync'ed:  TRUE 
coefficients:
	(Intercept)

Retrieve results of the test

res = sleuth_results(so, test = "reduced:full", test_type = "lrt", show_all = TRUE)
> head(res)
           target_id ext_gene         pval         qval test_stat       rss degrees_free mean_obs  var_obs     tech_var sigma_sq smooth_sigma_sq final_sigma_sq
1 ENSG00000134193.15     REG4 7.752172e-15 4.571714e-11  60.39724 119.68635            1 9.429405 6.299281 0.0005691108 6.298712      0.11357166       6.298712
2 ENSG00000062038.14     CDH3 7.599095e-15 4.571714e-11  60.43649 116.97036            1 2.960657 6.156335 0.1020903488 6.054245      0.28391752       6.054245
3  ENSG00000229637.4    PRAC2 4.337807e-15 4.571714e-11  61.54027  83.68869            1 1.457273 4.404668 0.0477239422 4.356944      0.45623196       4.356944
4  ENSG00000120875.9    DUSP4 2.600851e-14 1.150356e-10  58.01531  68.69969            1 5.183719 3.615773 0.0175501251 3.598223      0.11098455       3.598223
5 ENSG00000181577.16 C6orf223 4.235939e-14 1.498845e-10  57.05592 118.50929            1 3.753971 6.237331 0.1957375093 6.041594      0.19989544       6.041594
6 ENSG00000196188.11     CTSE 1.287869e-13 2.488331e-10  54.86977  81.91557            1 7.031097 4.311346 0.0025249259 4.308821      0.09906819       4.308821

Exporting results

We construct a normData data.frame to store per-group normalised mean transcripts per million (TPM) and normalised TPM of all samples, and a deData data.frame to store the Sleuth LRT results. We merge both data.frame with a annoData data.frame that contain per-transcript annotation derived from GENCODE GTF file before exporting the results in tabular format

tpm_norm = as.data.frame(sleuth_to_matrix(so, "obs_norm", "tpm"))
tpm_mean = as.data.frame(sapply( levels(s2c$condition), 
function(lvl) rowMeans( tpm_norm[, s2c$condition == lvl, drop = FALSE] ) ))

# log2 fold change in expression
tpm.log2fc = log2(tpm_mean[2]/tpm_mean[1])
colnames(tpm.log2fc) = "log2fc"

# Load gene-level annotation
annoData = "/home/USER/db/refanno/gencode.v33.annotation_genes.txt"
annoData = data.frame(fread(annoData))

normData = merge(annoData, merge(tpm_mean, tpm_norm, by = "row.names"), by.x = "GeneID", by.y = "Row.names")
normData = normData[order(normData$Chromosome, normData$Start, normData$End),]

deData = merge(res[,c("target_id","mean_obs","pval","qval")], tpm.log2fc, by.x = "target_id", by.y = "row.names")
colnames(deData) = c("target_id","baseMean","pvalue","padj","log2fc")
deData = deData[,c("target_id","baseMean","log2fc","pvalue","padj")]
deData[is.nan(deData$log2fc),]$log2fc = NA
deData[is.infinite(deData$log2fc),]$log2fc = NA
deData = merge(annoData, deData, by.x = 'GeneID', by.y = 'target_id')
deData = deData[order(deData$Chromosome, deData$Start, deData$End),]

write.table(normData, file="DGE_sleuth_Means_and_NormalisedTPM.Salmon.txt", sep = "\t", quote = F, row.names = F, col.names = T)
write.table(deData, file="DGE_sleuth_DE_results.Salmon.txt", sep = "\t", quote = F, row.names = F, col.names = T)
> normData[1:10,1:14]
                 GeneID  GeneSymbol Chromosome Start    End                              Class Strand Length     normal       0-IIa ERR2675454 ERR2675455 ERR2675458 ERR2675459
24914 ENSG00000223972.5     DDX11L1       chr1 11869  14409 transcribed_unprocessed_pseudogene      +   2540 0.02578694 0.002660157  0.0000000  0.0000000   0.000000   0.000000
27153 ENSG00000227232.5      WASH7P       chr1 14404  29570             unprocessed_pseudogene      -  15166 9.64654711 7.418128917  5.0318657  7.0457119   5.043944  11.669079
54425 ENSG00000278267.1   MIR6859-1       chr1 17369  17436                              miRNA      -     67 0.00000000 0.000000000  0.0000000  0.0000000   0.000000   0.000000
36601 ENSG00000243485.5 MIR1302-2HG       chr1 29554  31109                             lncRNA      +   1555 0.01174571 0.000000000  0.0000000  0.1174571   0.000000   0.000000
57122 ENSG00000284332.1   MIR1302-2       chr1 30366  30503                              miRNA      +    137 0.00000000 0.000000000  0.0000000  0.0000000   0.000000   0.000000
34149 ENSG00000237613.2     FAM138A       chr1 34554  36081                             lncRNA      -   1527 0.00000000 0.000000000  0.0000000  0.0000000   0.000000   0.000000
49448 ENSG00000268020.3      OR4G4P       chr1 52473  53312             unprocessed_pseudogene      +    839 0.00000000 0.000000000  0.0000000  0.0000000   0.000000   0.000000
35334 ENSG00000240361.2     OR4G11P       chr1 57598  64116 transcribed_unprocessed_pseudogene      +   6518 0.00000000 0.000000000  0.0000000  0.0000000   0.000000   0.000000
16136 ENSG00000186092.6       OR4F5       chr1 65419  71585                     protein_coding      +   6166 0.00000000 0.000000000  0.0000000  0.0000000   0.000000   0.000000
34420 ENSG00000238009.6  AL627309.1       chr1 89295 133723                             lncRNA      -  44428 0.37463259 0.356671874  0.1921258  0.5975709   0.389049   0.307692

> deData[1:10,]
                 GeneID  GeneSymbol Chromosome Start    End                              Class Strand Length baseMean      log2fc    pvalue      padj
24914 ENSG00000223972.5     DDX11L1       chr1 11869  14409 transcribed_unprocessed_pseudogene      +   2540       NA -3.27705754        NA        NA
27153 ENSG00000227232.5      WASH7P       chr1 14404  29570             unprocessed_pseudogene      -  15166 5.289557 -0.37895730 0.1989157 0.2871868
54425 ENSG00000278267.1   MIR6859-1       chr1 17369  17436                              miRNA      -     67       NA          NA        NA        NA
36601 ENSG00000243485.5 MIR1302-2HG       chr1 29554  31109                             lncRNA      +   1555       NA          NA        NA        NA
57122 ENSG00000284332.1   MIR1302-2       chr1 30366  30503                              miRNA      +    137       NA          NA        NA        NA
34149 ENSG00000237613.2     FAM138A       chr1 34554  36081                             lncRNA      -   1527       NA          NA        NA        NA
49448 ENSG00000268020.3      OR4G4P       chr1 52473  53312             unprocessed_pseudogene      +    839       NA          NA        NA        NA
35334 ENSG00000240361.2     OR4G11P       chr1 57598  64116 transcribed_unprocessed_pseudogene      +   6518       NA          NA        NA        NA
16136 ENSG00000186092.6       OR4F5       chr1 65419  71585                     protein_coding      +   6166       NA          NA        NA        NA
34420 ENSG00000238009.6  AL627309.1       chr1 89295 133723                             lncRNA      -  44428 1.263687 -0.07087897 0.2654303 0.3487556

Exploring results

Principal component plot of the samples

png("DGE_sleuth_PCA.png", width=7, height=7, units = "in", res = 300)
plot_pca(so, color_by = "condition", text_labels = TRUE, units = "tpm")
dev.off()

Plot sample heatmap using the Jensen-Shannon divergence

png("DGE_sleuth_sample_heatmap.png", width=7, height=7, units = "in", res = 300)
plot_sample_heatmap(so, use_filtered = FALSE)
dev.off()

Lower divergence values represent samples that are more similar to each other

Plot clustered transcript heatmap

Create expression heatmap for top 40 genes with lowest q-values

png("DGE_sleuth_transcript_heatmap.png", width=7, height=7, units = "in", res = 300)
plot_transcript_heatmap(so, cluster_transcripts = TRUE,
transcripts = subset(res[order(res$qval),], qval < 0.05)$target_id[1:40]) # top40
dev.off()

Plot the normalized bootstraps across all samples

Plot expression with bootstrap variation for selected gene

png("DGE_sleuth_plot_bootstrap.png", width=7, height=7, units = "in", res = 300)
plot_bootstrap(so, target_id = "ENSG00000062038.14", 
units = "tpm", color_by = "condition")
dev.off()

Session info

> sessionInfo()
R version 3.6.3 (2020-02-29)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 18.04.4 LTS

Matrix products: default
BLAS/LAPACK: /home/USER/miniconda3/lib/libopenblasp-r0.3.9.so

locale:
 [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
 [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
 [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods  
[7] base     

other attached packages:
[1] sleuth_0.30.0     data.table_1.12.8

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.4.6       RColorBrewer_1.1-2 pillar_1.4.4      
 [4] compiler_3.6.3     plyr_1.8.6         tools_3.6.3       
 [7] digest_0.6.25      lifecycle_0.2.0    tibble_3.0.1      
[10] gtable_0.3.0       rhdf5_2.30.1       pkgconfig_2.0.3   
[13] rlang_0.4.6        cli_2.0.2          parallel_3.6.3    
[16] gridExtra_2.3      dplyr_0.8.5        stringr_1.4.0     
[19] vctrs_0.3.0        grid_3.6.3         tidyselect_1.1.0  
[22] glue_1.4.1         R6_2.4.1           fansi_0.4.1       
[25] pheatmap_1.0.12    ggplot2_3.3.0      Rhdf5lib_1.8.0    
[28] purrr_0.3.4        tidyr_1.0.3        reshape2_1.4.4    
[31] farver_2.0.3       magrittr_1.5       scales_1.1.1      
[34] ellipsis_0.3.0     assertthat_0.2.1   colorspace_1.4-1  
[37] labeling_0.3       stringi_1.4.6      lazyeval_0.2.2    
[40] munsell_0.5.0      aggregation_1.0.1  crayon_1.3.4      

DTE analysis

cd /home/USER/SSAPs/analysis/dte
R

Load required libraries

library(data.table)
library(sleuth)

Import clinical data

Load clinical information and define file path

dir = "../../"
sampleData = paste0(dir, "clinical.txt")
sampleData = fread(sampleData)
rownames(sampleData) = sampleData$ENA_RUN
sampleData$individual = as.factor(sampleData$individual)
sampleData$paris_classification = as.factor(sampleData$paris_classification)

# Use relevel() to set adjacent normal samples as reference
sampleData$paris_classification = relevel(sampleData$paris_classification, ref = "normal")

Construct t2g data.frame

Load the annoData data.frame that contain per-transcript annotation derived from GENCODE GTF file and constrct a transcript-to-gene data.frame t2g in the specified format by Sleuth

annoData = "/home/USER/db/refanno/gencode.v33.annotation_transcripts.txt"
annoData = data.frame(fread(annoData))

t2g  = annoData[,c("TranscriptID","GeneID","GeneSymbol")]
colnames(t2g) = c("target_id","ens_gene","ext_gene")
> head(t2g)
          target_id          ens_gene    ext_gene
1 ENST00000456328.2 ENSG00000223972.5     DDX11L1
2 ENST00000450305.2 ENSG00000223972.5     DDX11L1
3 ENST00000488147.1 ENSG00000227232.5      WASH7P
4 ENST00000619216.1 ENSG00000278267.1   MIR6859-1
5 ENST00000473358.1 ENSG00000243485.5 MIR1302-2HG
6 ENST00000469289.1 ENSG00000243485.5 MIR1302-2HG

Construct s2c data.frame

Salmon

files = file.path(paste0(dir, "salmon-bs"), sampleData$ENA_RUN)
names(files) = sampleData$ENA_RUN
> files
                  ERR2675454                   ERR2675455 
"../../salmon-bs/ERR2675454" "../../salmon-bs/ERR2675455" 
                  ERR2675458                   ERR2675459 
"../../salmon-bs/ERR2675458" "../../salmon-bs/ERR2675459" 
                  ERR2675460                   ERR2675461 
"../../salmon-bs/ERR2675460" "../../salmon-bs/ERR2675461" 
                  ERR2675464                   ERR2675465 
"../../salmon-bs/ERR2675464" "../../salmon-bs/ERR2675465" 
                  ERR2675468                   ERR2675469 
"../../salmon-bs/ERR2675468" "../../salmon-bs/ERR2675469" 
                  ERR2675472                   ERR2675473 
"../../salmon-bs/ERR2675472" "../../salmon-bs/ERR2675473" 
                  ERR2675476                   ERR2675477 
"../../salmon-bs/ERR2675476" "../../salmon-bs/ERR2675477" 
                  ERR2675478                   ERR2675479 
"../../salmon-bs/ERR2675478" "../../salmon-bs/ERR2675479" 
                  ERR2675480                   ERR2675481 
"../../salmon-bs/ERR2675480" "../../salmon-bs/ERR2675481" 
                  ERR2675484                   ERR2675485 
"../../salmon-bs/ERR2675484" "../../salmon-bs/ERR2675485"    

Create a data.frame s2c that contains the required information in the specified format by Sleuth

s2c = data.frame(sample = sampleData$ENA_RUN, 
condition = sampleData$paris_classification, 
path = files, stringsAsFactors = FALSE)
> s2c
               sample condition                       path
ERR2675454 ERR2675454     0-IIa ../../salmon-bs/ERR2675454
ERR2675455 ERR2675455    normal ../../salmon-bs/ERR2675455
ERR2675458 ERR2675458     0-IIa ../../salmon-bs/ERR2675458
ERR2675459 ERR2675459    normal ../../salmon-bs/ERR2675459
ERR2675460 ERR2675460     0-IIa ../../salmon-bs/ERR2675460
ERR2675461 ERR2675461    normal ../../salmon-bs/ERR2675461
ERR2675464 ERR2675464     0-IIa ../../salmon-bs/ERR2675464
ERR2675465 ERR2675465    normal ../../salmon-bs/ERR2675465
ERR2675468 ERR2675468     0-IIa ../../salmon-bs/ERR2675468
ERR2675469 ERR2675469    normal ../../salmon-bs/ERR2675469
ERR2675472 ERR2675472     0-IIa ../../salmon-bs/ERR2675472
ERR2675473 ERR2675473    normal ../../salmon-bs/ERR2675473
ERR2675476 ERR2675476     0-IIa ../../salmon-bs/ERR2675476
ERR2675477 ERR2675477    normal ../../salmon-bs/ERR2675477
ERR2675478 ERR2675478     0-IIa ../../salmon-bs/ERR2675478
ERR2675479 ERR2675479    normal ../../salmon-bs/ERR2675479
ERR2675480 ERR2675480     0-IIa ../../salmon-bs/ERR2675480
ERR2675481 ERR2675481    normal ../../salmon-bs/ERR2675481
ERR2675484 ERR2675484     0-IIa ../../salmon-bs/ERR2675484
ERR2675485 ERR2675485    normal ../../salmon-bs/ERR2675485

Sleuth will compare all conditions against the reference level (i.e. first level in the factor variable). Use the levels() function to check the order of the condition and userelevel() function to reassign reference level if necessary

> levels(s2c$condition)
[1] "normal" "0-IIa" 

Kallisto

files = file.path(paste0(dir, "kallisto"), sampleData$ENA_RUN)
names(files) = sampleData$ENA_RUN
> files
                 ERR2675454                  ERR2675455 
"../../kallisto/ERR2675454" "../../kallisto/ERR2675455" 
                 ERR2675458                  ERR2675459 
"../../kallisto/ERR2675458" "../../kallisto/ERR2675459" 
                 ERR2675460                  ERR2675461 
"../../kallisto/ERR2675460" "../../kallisto/ERR2675461" 
                 ERR2675464                  ERR2675465 
"../../kallisto/ERR2675464" "../../kallisto/ERR2675465" 
                 ERR2675468                  ERR2675469 
"../../kallisto/ERR2675468" "../../kallisto/ERR2675469" 
                 ERR2675472                  ERR2675473 
"../../kallisto/ERR2675472" "../../kallisto/ERR2675473" 
                 ERR2675476                  ERR2675477 
"../../kallisto/ERR2675476" "../../kallisto/ERR2675477" 
                 ERR2675478                  ERR2675479 
"../../kallisto/ERR2675478" "../../kallisto/ERR2675479" 
                 ERR2675480                  ERR2675481 
"../../kallisto/ERR2675480" "../../kallisto/ERR2675481" 
                 ERR2675484                  ERR2675485 
"../../kallisto/ERR2675484" "../../kallisto/ERR2675485" 

Create a sample_to_covariates data.frame s2c that contains the required information in the specified format by Sleuth

s2c = data.frame(sample = sampleData$ENA_RUN, 
condition = sampleData$paris_classification, 
path = files, stringsAsFactors = FALSE)
> s2c
               sample condition                      path
ERR2675454 ERR2675454     0-IIa ../../kallisto/ERR2675454
ERR2675455 ERR2675455    normal ../../kallisto/ERR2675455
ERR2675458 ERR2675458     0-IIa ../../kallisto/ERR2675458
ERR2675459 ERR2675459    normal ../../kallisto/ERR2675459
ERR2675460 ERR2675460     0-IIa ../../kallisto/ERR2675460
ERR2675461 ERR2675461    normal ../../kallisto/ERR2675461
ERR2675464 ERR2675464     0-IIa ../../kallisto/ERR2675464
ERR2675465 ERR2675465    normal ../../kallisto/ERR2675465
ERR2675468 ERR2675468     0-IIa ../../kallisto/ERR2675468
ERR2675469 ERR2675469    normal ../../kallisto/ERR2675469
ERR2675472 ERR2675472     0-IIa ../../kallisto/ERR2675472
ERR2675473 ERR2675473    normal ../../kallisto/ERR2675473
ERR2675476 ERR2675476     0-IIa ../../kallisto/ERR2675476
ERR2675477 ERR2675477    normal ../../kallisto/ERR2675477
ERR2675478 ERR2675478     0-IIa ../../kallisto/ERR2675478
ERR2675479 ERR2675479    normal ../../kallisto/ERR2675479
ERR2675480 ERR2675480     0-IIa ../../kallisto/ERR2675480
ERR2675481 ERR2675481    normal ../../kallisto/ERR2675481
ERR2675484 ERR2675484     0-IIa ../../kallisto/ERR2675484
ERR2675485 ERR2675485    normal ../../kallisto/ERR2675485

Construct Sleuth object

Construct the Sleuth object so with sleuth_prep. This object will store the metadata, model to be used for differential testing, transcript-to-gene and bootstrap information.

The default transformation of counts is natural log, i.e. log(x+0.5). We change this to log base 2 by assigning an appropriate transformation_function

so = sleuth_prep(s2c, ~ condition, 
target_mapping = t2g, 
extra_bootstrap_summary = TRUE, read_bootstrap_tpm = TRUE, 
transformation_function = function(x) log2(x + 0.5))
reading in kallisto results
dropping unused factor levels
....................
normalizing est_counts
51524 targets passed the filter
normalizing tpm
merging in metadata
summarizing bootstraps
....................

Differential expression analysis

The likelihood ratio test is performed with a 3-step procedure. First, fit the full model by fit_name = "full", then performs a second fit to a “reduced” model that presumes abundances are equal in the two conditions. Finally, Sleuth will perform LRT to identify transcripts with a significantly better fit with the “full” model

so = sleuth_fit(so, ~condition, fit_name = "full")
so = sleuth_fit(so, ~1, fit_name = "reduced")
so = sleuth_lrt(so, null_model = "reduced", alt_model = "full")

The models that have been fit can be examined with the models() function.

> models(so)
[  full  ]
formula:  ~condition 
data modeled:  obs_counts 
transform sync'ed:  TRUE 
coefficients:
	(Intercept)
 	condition0-IIa
[  reduced  ]
formula:  ~1 
data modeled:  obs_counts 
transform sync'ed:  TRUE 
coefficients:
	(Intercept)

Retrieve results of the test

res = sleuth_results(so, test = "reduced:full", test_type = "lrt", show_all = TRUE)
> head(res)
           target_id           ens_gene ext_gene         pval         qval test_stat       rss degrees_free mean_obs   var_obs     tech_var  sigma_sq smooth_sigma_sq final_sigma_sq
1 ENST00000256585.10 ENSG00000134193.15     REG4 7.809240e-15 2.011816e-10  60.38280 119.93926            1 9.275326  6.312593 0.0006011685  6.311992       0.1057915       6.311992
2  ENST00000369401.4 ENSG00000134193.15     REG4 4.500564e-15 2.011816e-10  61.46774 109.01207            1 5.515326  5.737477 0.0265451861  5.710932       0.2012956       5.710932
3  ENST00000240100.7  ENSG00000120875.9    DUSP4 3.638438e-14 6.248897e-10  57.35496  68.99341            1 5.429044  3.631232 0.0179548375  3.613277       0.2110772       3.613277
4  ENST00000293599.7  ENSG00000161798.7     AQP5 8.505383e-14 1.095578e-09  55.68526 233.22374            1 3.003379 12.274934 0.0830957976 12.191838       0.8045670      12.191838
5  ENST00000598581.3  ENSG00000268104.3  SLC6A14 1.214757e-13 1.251782e-09  54.98464 125.03073            1 4.395581  6.580565 0.1085976744  6.471967       0.3933200       6.471967
6  ENST00000257264.4  ENSG00000134827.8     TCN1 3.746770e-13 3.217476e-09  52.77162 169.30624            1 2.718831  8.910855 0.2021942715  8.708661       0.8876592       8.708661

Exporting results

We construct a normData data.frame to store per-group normalised mean transcripts per million (TPM) and normalised TPM of all samples, and a deData data.frame to store the Sleuth LRT results. We merge both data.frame with a annoData data.frame that contain per-transcript annotation derived from GENCODE GTF file before exporting the results in tabular format

tpm_norm = as.data.frame(sleuth_to_matrix(so, "obs_norm", "tpm"))
tpm_mean = as.data.frame(sapply( levels(s2c$condition), 
function(lvl) rowMeans( tpm_norm[, s2c$condition == lvl, drop = FALSE] ) ))

# log2 fold change in expression
tpm.log2fc = log2(tpm_mean[2]/tpm_mean[1])
colnames(tpm.log2fc) = "log2fc"

normData = merge(annoData, merge(tpm_mean, tpm_norm, by = "row.names"), by.x = "TranscriptID", by.y = "Row.names")
normData = normData[order(normData$Chromosome, normData$Start, normData$End),]

deData = merge(res[,c("target_id","mean_obs","pval","qval")], tpm.log2fc, by.x = "target_id", by.y = "row.names")
colnames(deData) = c("target_id","baseMean","pvalue","padj","log2fc")
deData = deData[,c("target_id","baseMean","log2fc","pvalue","padj")] # rearrange columns
deData[is.nan(deData$log2fc),]$log2fc = NA
deData[is.infinite(deData$log2fc),]$log2fc = NA
deData = merge(annoData, deData, by.x = 'TranscriptID', by.y = 'target_id')
deData = deData[order(deData$Chromosome, deData$Start, deData$End),]

write.table(normData, file="DTE_sleuth_Means_and_NormalisedTPM.Salmon.txt", sep = "\t", quote = F, row.names = F, col.names = T)
write.table(deData, file="DTE_sleuth_DE_results.Salmon.txt", sep = "\t", quote = F, row.names = F, col.names = T)
> normData[normData$GeneSymbol == "ZIC2", 1:14]
           TranscriptID             GeneID GeneSymbol Chromosome    Start      End TranscriptName                Class Strand Length     normal      0-IIa ERR2675454 ERR2675455
25392 ENST00000376335.8 ENSG00000043355.12       ZIC2      chr13 99981784 99986765       ZIC2-201       protein_coding      +   4981 0.05172293 4.70682916  7.6582729          0
93049 ENST00000490085.5 ENSG00000043355.12       ZIC2      chr13 99984289 99985492       ZIC2-205 processed_transcript      +   1203 0.00000000 0.01930897  0.1930897          0
75590 ENST00000468291.1 ENSG00000043355.12       ZIC2      chr13 99984689 99985492       ZIC2-202 processed_transcript      +    803 0.08067398 0.00000000  0.0000000          0
82745 ENST00000477213.1 ENSG00000043355.12       ZIC2      chr13 99984789 99985474       ZIC2-203 processed_transcript      +    685 0.01601589 0.18323929  0.0000000          0
86234 ENST00000481565.1 ENSG00000043355.12       ZIC2      chr13 99985494 99985904       ZIC2-204 processed_transcript      +    410 0.00000000 0.00000000  0.0000000          0

> deData[deData$GeneSymbol == "ZIC2",]
           TranscriptID             GeneID GeneSymbol Chromosome    Start      End TranscriptName                Class Strand Length baseMean   log2fc       pvalue             padj
25392 ENST00000376335.8 ENSG00000043355.12       ZIC2      chr13 99981784 99986765       ZIC2-201       protein_coding      +   4981 3.022651 6.507808 3.409937e-12 0.00000001098085
93049 ENST00000490085.5 ENSG00000043355.12       ZIC2      chr13 99984289 99985492       ZIC2-205 processed_transcript      +   1203       NA       NA           NA               NA
75590 ENST00000468291.1 ENSG00000043355.12       ZIC2      chr13 99984689 99985492       ZIC2-202 processed_transcript      +    803       NA       NA           NA               NA
82745 ENST00000477213.1 ENSG00000043355.12       ZIC2      chr13 99984789 99985474       ZIC2-203 processed_transcript      +    685       NA 3.516153           NA               NA
86234 ENST00000481565.1 ENSG00000043355.12       ZIC2      chr13 99985494 99985904       ZIC2-204 processed_transcript      +    410       NA       NA           NA               NA

Exploring results

Principal component plot of the samples

png("DTE_sleuth_PCA.png", width=7, height=7, units = "in", res = 300)
plot_pca(so, color_by = "condition", text_labels = TRUE, units = "tpm")
dev.off()

Plot sample heatmap using the Jensen-Shannon divergence

png("DTE_sleuth_sample_heatmap.png", width=7, height=7, units = "in", res = 300)
plot_sample_heatmap(so, use_filtered = FALSE, units = "tpm")
dev.off()

Lower divergence values represent samples that are more similar to each other

Plot clustered transcript heatmap

Create expression heatmap for top 40 transcripts with lowest q-values

png("DTE_sleuth_transcript_heatmap.png", width=7, height=7, units = "in", res = 300)
plot_transcript_heatmap(so, units = "tpm", cluster_transcripts = TRUE,
transcripts = subset(res[order(res$qval),], qval < 0.05)$target_id[1:40])
dev.off()

Plot the normalized bootstraps across all samples

Plot expression with bootstrap variation for selected transcript

png("DTE_sleuth_plot_bootstrap.png", width=7, height=7, units = "in", res = 300)
plot_bootstrap(so, target_id = "ENST00000267294.4", 
units = "tpm", color_by = "condition")
dev.off()

Session info

> sessionInfo()
R version 3.6.3 (2020-02-29)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 18.04.4 LTS

Matrix products: default
BLAS/LAPACK: /home/USER/miniconda3/lib/libopenblasp-r0.3.9.so

locale:
 [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
 [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
 [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods  
[7] base     

other attached packages:
[1] sleuth_0.30.0     data.table_1.12.8

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.4.6       magrittr_1.5       tidyselect_1.1.0  
 [4] munsell_0.5.0      colorspace_1.4-1   R6_2.4.1          
 [7] rlang_0.4.6        stringr_1.4.0      plyr_1.8.6        
[10] dplyr_0.8.5        tools_3.6.3        parallel_3.6.3    
[13] grid_3.6.3         rhdf5_2.30.1       gtable_0.3.0      
[16] ellipsis_0.3.0     digest_0.6.25      lazyeval_0.2.2    
[19] assertthat_0.2.1   tibble_3.0.1       lifecycle_0.2.0   
[22] crayon_1.3.4       gridExtra_2.3      RColorBrewer_1.1-2
[25] farver_2.0.3       reshape2_1.4.4     tidyr_1.0.3       
[28] purrr_0.3.4        Rhdf5lib_1.8.0     ggplot2_3.3.0     
[31] vctrs_0.3.0        glue_1.4.1         labeling_0.3      
[34] pheatmap_1.0.12    stringi_1.4.6      compiler_3.6.3    
[37] pillar_1.4.4       scales_1.1.1       pkgconfig_2.0.3   

Last updated