Create <code>tidybulk</code> tibble.

<!– badges: start –> <!– badges: end –>

In this article we show some examples of the differences in coding between tidybulk/tidyverse and base R. We noted a decrease > 10x of assignments and a decrease of > 2x of line numbers.

Create `tidybulk` tibble.

tt = se_mini

Aggregate duplicated `transcripts`

Tidy transcriptomics “{.r .yellow} rowData(tt)$gene_name = rownames(tt) tt.aggr = tt %>% aggregate_duplicates(.transcript = gene_name) ”

Base R “r temp = data.frame( symbol = dge_list$genes$symbol, dge_list$counts ) dge_list.nr <- by(temp, temp$symbol, function(df) if(length(df[1,1])>0) matrixStats:::colSums(as.matrix(df[,-1])) ) dge_list.nr <- do.call("rbind", dge_list.nr) colnames(dge_list.nr) <- colnames(dge_list) ```

Scale `counts`

Tidy transcriptomics ”r tt.norm = tt.aggr %>% identify_abundant(factor_of_interest = condition) %>% scale_abundance() “

Base R ”r library(edgeR) dgList <- DGEList(count_m=x,group=group) keep <- filterByExpr(dgList) dgList <- dgList[keep,,keep.lib.sizes=FALSE] [...] dgList <- calcNormFactors(dgList, method="TMM") norm_counts.table <- cpm(dgList) ```

Filter `variable transcripts`

We may want to identify and filter variable transcripts.

Tidy transcriptomics “r tt.norm.variable = tt.norm %>% keep_variable() ”

Base R “r library(edgeR) x = norm_counts.table s <- rowMeans((x-rowMeans(x))^2) o <- order(s,decreasing=TRUE) x <- x[o[1L:top],,drop=FALSE] norm_counts.table = norm_counts.table[rownames(x)] norm_counts.table$cell_type = tibble_counts[ match( tibble_counts$sample, rownames(norm_counts.table) ), "Cell type" ] ```

Reduce `dimensions`

Tidy transcriptomics ”r tt.norm.MDS = tt.norm %>% reduce_dimensions(method=“MDS”, .dims = 2) “

Base R ”r library(limma) count_m_log = log(count_m + 1) cmds = limma::plotMDS(ndim = .dims, plot = FALSE) cmds = cmds %$% cmdscale.out %>% setNames(sprintf(“Dim%s”, 1:6)) cmds$cell_type = tibble_counts[ match(tibble_counts$sample, rownames(cmds)), “Cell type” ] “

PCA

Tidy transcriptomics ”r tt.norm.PCA = tt.norm %>% reduce_dimensions(method=“PCA”, .dims = 2) “

Base R ”r count_m_log = log(count_m + 1) pc = count_m_log %>% prcomp(scale = TRUE) variance = pc$sdev^2 variance = (variance / sum(variance))[1:6] pc$cell_type = counts[ match(counts$sample, rownames(pc)), “Cell type” ] “

tSNE

Tidy transcriptomics ”r tt.norm.tSNE = breast_tcga_mini_SE %>% tidybulk( sample, ens, count_scaled) %>% identify_abundant() %>% reduce_dimensions( method = “tSNE”, perplexity=10, pca_scale =TRUE ) “

Base R ”r count_m_log = log(count_m + 1) tsne = Rtsne::Rtsne( t(count_m_log), perplexity=10, pca_scale =TRUE )$Y tsne$cell_type = tibble_counts[ match(tibble_counts$sample, rownames(tsne)), “Cell type” ] “

Rotate `dimensions`

Tidy transcriptomics ”r tt.norm.MDS.rotated = tt.norm.MDS %>% rotate_dimensions(Dim1, Dim2, rotation_degrees = 45, action=“get”) “

Base R ”r rotation = function(m, d) { r = d * pi / 180 ((bind_rows( c(1 = cos®, 2 = -sin®), c(1 = sin®, 2 = cos®) ) %>% as_matrix) %*% m) } mds_r = pca %>% rotation(rotation_degrees) mds_r$cell_type = counts[ match(counts$sample, rownames(mds_r)), “Cell type” ] “

Test `differential abundance`

Tidy transcriptomics ”r tt.de = tt %>% test_differential_abundance( ~ condition, action=“get”) tt.de “

Base R ”r library(edgeR) dgList <- DGEList(counts=counts_m,group=group) keep <- filterByExpr(dgList) dgList <- dgList[keep,,keep.lib.sizes=FALSE] dgList <- calcNormFactors(dgList) design <- model.matrix(~group) dgList <- estimateDisp(dgList,design) fit <- glmQLFit(dgList,design) qlf <- glmQLFTest(fit,coef=2) topTags(qlf, n=Inf) ```

Adjust `counts`

Tidy transcriptomics “r tt.norm.adj = tt.norm %>% adjust_abundance( ~ condition + time) ”

Base R “r library(sva) count_m_log = log(count_m + 1) design = model.matrix( object = ~ condition + time, data = annotation ) count_m_log.sva = ComBat( batch = design[,2], mod = design, … ) count_m_log.sva = ceiling(exp(count_m_log.sva) -1) count_m_log.sva$cell_type = counts[ match(counts$sample, rownames(count_m_log.sva)), "Cell type” ] “

Deconvolve `Cell type composition`

Tidy transcriptomics ”r tt.cibersort = tt %>% deconvolve_cellularity(action=“get”, cores=1) “

Base R ”r source(‘CIBERSORT.R’) count_m %>% write.table(“mixture_file.txt”) results <- CIBERSORT( "sig_matrix_file.txt", "mixture_file.txt", perm=100, QN=TRUE ) results$cell_type = tibble_counts[ match(tibble_counts$sample, rownames(results)), "Cell type" ] ```

Cluster `samples`

k-means

Tidy transcriptomics “r tt.norm.cluster = tt.norm.MDS %>% cluster_elements(method="kmeans”, centers = 2, action=“get” ) “

Base R ”r count_m_log = log(count_m + 1) k = kmeans(count_m_log, iter.max = 1000, …) cluster = k$cluster cluster$cell_type = tibble_counts[ match(tibble_counts$sample, rownames(cluster)), c(“Cell type”, “Dim1”, “Dim2”) ] “

SNN

Matrix package (v1.3-3) causes an error with Seurat::FindNeighbors used in this method. We are trying to solve this issue. At the moment this option in unaviable.

Tidy transcriptomics ”r tt.norm.SNN = tt.norm.tSNE %>% cluster_elements(method = “SNN”) “

Base R ”r library(Seurat) snn = CreateSeuratObject(count_m) snn = ScaleData( snn, display.progress = TRUE, num.cores=4, do.par = TRUE ) snn = FindVariableFeatures(snn, selection.method = “vst”) snn = FindVariableFeatures(snn, selection.method = “vst”) snn = RunPCA(snn, npcs = 30) snn = FindNeighbors(snn) snn = FindClusters(snn, method = “igraph”, …) snn = snn[[“seurat_clusters”]] snn$cell_type = tibble_counts[ match(tibble_counts$sample, rownames(snn)), c(“Cell type”, “Dim1”, “Dim2”) ] “

Drop `redundant` transcripts

Tidy transcriptomics ”r tt.norm.non_redundant = tt.norm.MDS %>% remove_redundancy( method = “correlation” ) “

Base R ”r library(widyr) .data.correlated = pairwise_cor( counts, sample, transcript, rc, sort = TRUE, diag = FALSE, upper = FALSE ) %>% filter(correlation > correlation_threshold) %>% distinct(item1) %>% rename(!!.element := item1) # Return non redundant data frame counts %>% anti_join(.data.correlated) %>% spread(sample, rc, - transcript) %>% left_join(annotation) “

Draw `heatmap`

tidytranscriptomics ”r tt.norm.MDS %>% # filter lowly abundant keep_abundant() %>% # extract 500 most variable genes keep_variable( .abundance = count_scaled, top = 500) %>% # create heatmap heatmap(sample, transcript, count_scaled, transform = log1p) %>% add_tile(Cell type) “

Base R ”r # Example taken from airway dataset from BioC2020 workshop. dgList <- SE2DGEList(airway) group <- factor(dgList$samples$`Cell type`) keep.exprs <- filterByExpr(dgList, group=group) dgList <- dgList[keep.exprs,, keep.lib.sizes=FALSE] dgList <- calcNormFactors(dgList) logcounts <- cpm(dgList, log=TRUE) var_genes <- apply(logcounts, 1, var) select_var <- names(sort(var_genes, decreasing=TRUE))[1:500] highly_variable_lcpm <- logcounts[select_var,] colours <- c("#440154FF", "#21908CFF", "#fefada" ) col.group <- c("red","grey")[group] gplots::heatmap.2(highly_variable_lcpm, col=colours, trace="none", ColSideColors=col.group, scale="row") ```

Draw `density plot`

tidytranscriptomics “r # Example taken from airway dataset from BioC2020 workshop. airway %>% tidybulk() %>% identify_abundant() %>% scale_abundance() %>% pivot_longer(cols = starts_with("counts”), names_to = “source”, values_to = “abundance”) %>% filter(!lowly_abundant) %>% ggplot(aes(x=abundance + 1, color=sample)) + geom_density() + facet_wrap(~source) + scale_x_log10() “

Base R ”r # Example taken from airway dataset from BioC2020 workshop. dgList <- SE2DGEList(airway) group <- factor(dgList$samples$dex) keep.exprs <- filterByExpr(dgList, group=group) dgList <- dgList[keep.exprs,, keep.lib.sizes=FALSE] dgList <- calcNormFactors(dgList) logcounts <- cpm(dgList, log=TRUE) var_genes <- apply(logcounts, 1, var) select_var <- names(sort(var_genes, decreasing=TRUE))[1:500] highly_variable_lcpm <- logcounts[select_var,] colours <- c("#440154FF", "#21908CFF", "#fefada" ) col.group <- c("red","grey")[group] gplots::heatmap.2(highly_variable_lcpm, col=colours, trace="none", ColSideColors=col.group, scale="row") ```

Appendix

sessionInfo()

## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.5 LTS
## 
## Matrix products: default
## BLAS:   /home/biocbuild/bbs-3.15-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.15-bioc/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB              LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] tidySummarizedExperiment_1.6.1 SummarizedExperiment_1.26.1   
##  [3] Biobase_2.56.0                 GenomicRanges_1.48.0          
##  [5] GenomeInfoDb_1.32.4            IRanges_2.30.1                
##  [7] S4Vectors_0.34.0               BiocGenerics_0.42.0           
##  [9] MatrixGenerics_1.8.1           matrixStats_0.62.0            
## [11] tidybulk_1.8.2                 ggrepel_0.9.1                 
## [13] ggplot2_3.3.6                  magrittr_2.0.3                
## [15] tibble_3.1.8                   tidyr_1.2.1                   
## [17] dplyr_1.0.10                   knitr_1.40                    
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-159           bitops_1.0-7           bit64_4.0.5           
##  [4] httr_1.4.4             SnowballC_0.7.0        backports_1.4.1       
##  [7] tools_4.2.1            utf8_1.2.2             R6_2.5.1              
## [10] DBI_1.1.3              lazyeval_0.2.2         mgcv_1.8-40           
## [13] colorspace_2.0-3       withr_2.5.0            tidyselect_1.1.2      
## [16] bit_4.0.4              compiler_4.2.1         preprocessCore_1.58.0 
## [19] cli_3.4.1              DelayedArray_0.22.0    plotly_4.10.0         
## [22] scales_1.2.1           readr_2.1.2            genefilter_1.78.0     
## [25] stringr_1.4.1          digest_0.6.29          XVector_0.36.0        
## [28] pkgconfig_2.0.3        htmltools_0.5.3        fastmap_1.1.0         
## [31] limma_3.52.4           htmlwidgets_1.5.4      rlang_1.0.6           
## [34] RSQLite_2.2.17         generics_0.1.3         jsonlite_1.8.0        
## [37] BiocParallel_1.30.3    tokenizers_0.2.3       RCurl_1.98-1.8        
## [40] GenomeInfoDbData_1.2.8 Matrix_1.5-1           Rcpp_1.0.9            
## [43] munsell_0.5.0          fansi_1.0.3            lifecycle_1.0.2       
## [46] stringi_1.7.8          edgeR_3.38.4           zlibbioc_1.42.0       
## [49] plyr_1.8.7             Rtsne_0.16             grid_4.2.1            
## [52] blob_1.2.3             parallel_4.2.1         crayon_1.5.2          
## [55] lattice_0.20-45        Biostrings_2.64.1      splines_4.2.1         
## [58] annotate_1.74.0        hms_1.1.2              KEGGREST_1.36.3       
## [61] locfit_1.5-9.6         pillar_1.8.1           widyr_0.1.5           
## [64] reshape2_1.4.4         codetools_0.2-18       XML_3.99-0.10         
## [67] glue_1.6.2             evaluate_0.16          tidytext_0.3.4        
## [70] data.table_1.14.2      vctrs_0.4.2            png_0.1-7             
## [73] tzdb_0.3.0             gtable_0.3.1           purrr_0.3.4           
## [76] assertthat_0.2.1       cachem_1.0.6           xfun_0.33             
## [79] broom_1.0.1            xtable_1.8-4           janeaustenr_1.0.0     
## [82] survival_3.4-0         viridisLite_0.4.1      AnnotationDbi_1.58.0  
## [85] memoise_2.0.1          sva_3.44.0             ellipsis_0.3.2

Create tidybulk tibble.

Aggregate duplicated transcripts

Scale counts

Filter variable transcripts

Reduce dimensions

PCA

tSNE

Rotate dimensions

Test differential abundance

Adjust counts

Deconvolve Cell type composition

Cluster samples