COTAN 2.0.5
options(parallelly.fork.enable = TRUE)
library(COTAN)
library(zeallot)
library(data.table)
library(factoextra)
library(Rtsne)
library(qpdf)
library(GEOquery)
This tutorial contains the same functionalities as the first release of the COTAN tutorial but done using the new and updated functions.
Download the data-set for "mouse cortex E17.5"
.
dataDir <- tempdir()
dataSetFile <- file.path(dataDir, "GSM2861514/GSM2861514_E175_Only_Cortical_Cells_DGE.txt.gz")
if (!file.exists(dataSetFile)) {
getGEOSuppFiles("GSM2861514", makeDirectory = TRUE,
baseDir = dataDir, fetch_files = TRUE,
filter_regex = "GSM2861514_E175_Only_Cortical_Cells_DGE.txt.gz")
sample.dataset <- read.csv(dataSetFile, sep = "\t", row.names = 1L)
}
Define a directory where the output will be stored.
outDir <- tempdir()
# Log-level 2 was chosen to showcase better how the package works
# In normal usage a level of 0 or 1 is more appropriate
setLoggingLevel(2)
#> Setting new log level to 2
# This file will contain all the logs produced by the package
# as if at the highest logging level
setLoggingFile(file.path(outDir, "vignette_v1.log"))
#> Setting log file to be: /tmp/RtmpDeAJNm/vignette_v1.log
Initialize the COTAN
object with the row count table and
the metadata for the experiment.
cond = "mouse_cortex_E17.5"
#cond = "test"
#obj = COTAN(raw = sampled.dataset)
obj = COTAN(raw = sample.dataset)
obj = initializeMetaDataset(obj,
GEO = "GSM2861514",
sequencingMethod = "Drop_seq",
sampleCondition = cond)
#> Initializing `COTAN` meta-data
logThis(paste0("Condition ", getMetadataElement(obj, datasetTags()[["cond"]])),
logLevel = 1)
#> Condition mouse_cortex_E17.5
Before we proceed to the analysis, we need to clean the data. The analysis will use a matrix of raw UMI counts as the input. To obtain this matrix, we have to remove any potential cell doublets or multiplets, as well as any low quality or dying cells.
We can check the library size (UMI number) with an empirical cumulative distribution function
ECDPlot(obj, yCut = 700)
cellSizePlot(obj)
#> Scale for y is already present.
#> Adding another scale for y, which will replace the existing scale.
genesSizePlot(obj)
#> Scale for y is already present.
#> Adding another scale for y, which will replace the existing scale.
#> Warning: Removed 1 rows containing missing values (`geom_point()`).
mit <- mitochondrialPercentagePlot(obj, genePrefix = "^Mt")
mit[["plot"]]
During the cleaning, every time we want to remove cells or genes
we can use the dropGenesCells()
function.
To drop cells by cell library size:
cells_to_rem <- getCells(obj)[getCellsSize(obj) > 6000]
obj <- dropGenesCells(obj, cells = cells_to_rem)
To drop cells by gene number:
cellGeneNumber <- sort(colSums(as.data.frame(getRawData(obj) > 0)),
decreasing = FALSE)
cells_to_rem <- names(cellGeneNumber)[cellGeneNumber > 3000]
obj <- dropGenesCells(obj, cells = cells_to_rem)
genesSizePlot(obj)
#> Scale for y is already present.
#> Adding another scale for y, which will replace the existing scale.
#> Warning: Removed 1 rows containing missing values (`geom_point()`).
To drop cells by mitochondrial percentage:
to_rem <- mit[["sizes"]][["mit.percentage"]] > 1.5
cells_to_rem <- rownames(mit[["sizes"]])[to_rem]
obj <- dropGenesCells(obj, cells = cells_to_rem)
mit <- mitochondrialPercentagePlot(obj, genePrefix = "^Mt")
mit[["plot"]]
If we do not want to consider the mitochondrial genes we can remove them before starting the analysis.
genes_to_rem = getGenes(obj)[grep('^Mt', getGenes(obj))]
cells_to_rem = getCells(obj)[which(getCellsSize(obj) == 0)]
obj = dropGenesCells(obj, genes_to_rem, cells_to_rem)
We want also to log the current status.
logThis(paste("n cells", getNumCells(obj)), logLevel = 1)
#> n cells 873
The clean()
function estimates all the parameters for the data. Therefore, we have to run it again every time we remove any genes or cells from the data.
n_it <- 1
obj <- clean(obj)
#> Genes/cells selection done: dropped [4744] genes and [0] cells
#> Working on [12278] genes and [873] cells
c(pcaCellsPlot, pcaCellsData, genesPlot, UDEPlot, nuPlot) %<-% cleanPlots(obj)
#> PCA: START
#> PCA: DONE
#> Hierarchical clustering: START
#> Hierarchical clustering: DONE
pcaCellsPlot
genesPlot
We can observe here that the red cells are really enriched in hemoglobin genes so we prefer to remove them. They can be extracted from the pcaCellsData
object and removed.
cells_to_rem <- rownames(pcaCellsData)[pcaCellsData[["groups"]] == "B"]
obj <- dropGenesCells(obj, cells = cells_to_rem)
n_it <- 2
obj <- clean(obj)
#> Genes/cells selection done: dropped [8] genes and [0] cells
#> Working on [12270] genes and [868] cells
c(pcaCellsPlot, pcaCellsData, genesPlot, UDEPlot, nuPlot) %<-% cleanPlots(obj)
#> PCA: START
#> PCA: DONE
#> Hierarchical clustering: START
#> Hierarchical clustering: DONE
pcaCellsPlot
To color the PCA based on nu
(so the cells’ efficiency)
UDEPlot
UDE (color) should not correlate with principal components! This is very important.
The next part is used to remove the cells with efficiency too low.
plot(nuPlot)
We can zoom on the smallest values and, if we detect a clear elbow, we can decide to remove the cells.
nuDf = data.frame("nu" = sort(getNu(obj)), "n" = seq_along(getNu(obj)))
yset = 0.35 # the threshold to remove low UDE cells
plot.ude <- ggplot(nuDf, aes(x = n, y = nu)) +
geom_point(colour = "#8491B4B2", size = 1) +
xlim(0, 400) +
ylim(0, 1) +
geom_hline(yintercept = yset, linetype = "dashed",
color = "darkred") +
annotate(geom = "text", x = 200, y = 0.25,
label = paste0("to remove cells with nu < ", yset),
color = "darkred", size = 4.5)
plot.ude
#> Warning: Removed 468 rows containing missing values (`geom_point()`).
We also save the defined threshold in the metadata and re-run the estimation
obj <- addElementToMetaDataset(obj, "Threshold low UDE cells:", yset)
cells_to_rem = rownames(nuDf)[nuDf[["nu"]] < yset]
obj <- dropGenesCells(obj, cells = cells_to_rem)
Repeat the estimation after the cells are removed
n_it <- 3
obj <- clean(obj)
#> Genes/cells selection done: dropped [30] genes and [0] cells
#> Working on [12240] genes and [841] cells
c(pcaCellsPlot, pcaCellsData, genesPlot, UDEPlot, nuPlot) %<-% cleanPlots(obj)
#> PCA: START
#> PCA: DONE
#> Hierarchical clustering: START
#> Hierarchical clustering: DONE
pcaCellsPlot
logThis(paste("n cells", getNumCells(obj)), logLevel = 1)
#> n cells 841
In this part, all the contingency tables are computed and used to get the statistics.
obj = estimateDispersionBisection(obj, cores = 10)
#> Estimate dispersion: START
#> Estimate dispersion: DONE
#> dispersion | min: -0.057464599609375 | max: 369.5 | % negative: 19.5506535947712
COEX
evaluation and storing
obj <- calculateCoex(obj)
#> Calculate genes' coex: START
#> Warning in asMethod(object): sparse->dense coercion: allocating vector of size
#> 1.1 GiB
#> Calculate genes' coex: DONE
# saving the structure
saveRDS(obj, file = file.path(outDir, paste0(cond, ".cotan.RDS")))
It is also possible to run directly a single function if we don’t want to clean anything.
obj2 <- automaticCOTANObjectCreation(
raw = sample.dataset,
GEO = "GSM2861514",
sequencingMethod = "Drop_seq",
sampleCondition = cond,
saveObj = TRUE, outDir = outDir, cores = 10)
To calculate the GDI
we can run:
quant.p = calculateGDI(obj)
#> Calculating S: START
#> Calculating S: DONE
#> Calculate GDI dataframe: START
#> Calculate GDI dataframe: DONE
head(quant.p)
#> sum.raw.norm GDI exp.cells
#> 0610007N19Rik 3.344461 1.618232 2.259215
#> 0610007P14Rik 5.259260 1.381777 18.906064
#> 0610009B22Rik 4.745286 1.273849 11.771700
#> 0610009D07Rik 6.326166 1.334185 44.114150
#> 0610009E02Rik 2.601880 1.016055 1.189061
#> 0610009L18Rik 3.521403 1.255381 3.091558
The next function can either plot the GDI
for the dataset directly or
use the pre-computed dataframe.
It marks a 1.5
threshold (in red) and
the two highest quantiles (in blue) by default.
We can also specify some gene sets (three in this case) that
we want to label explicitly in the GDI
plot.
genesList <- list(
"NPGs" = c("Nes", "Vim", "Sox2", "Sox1", "Notch1", "Hes1", "Hes5", "Pax6"),
"PNGs" = c("Map2", "Tubb3", "Neurod1", "Nefm", "Nefl", "Dcx", "Tbr1"),
"hk" = c("Calm1", "Cox6b1", "Ppia", "Rpl18", "Cox7c", "Erh", "H3f3a",
"Taf1", "Taf2", "Gapdh", "Actb", "Golph3", "Zfr", "Sub1",
"Tars", "Amacr")
)
# needs to be recalculated after the changes in nu/dispersion above
#obj <- calculateCoex(obj, actOnCells = FALSE)
GDIPlot(obj, cond = cond, genes = genesList)
#> GDI plot
#> Calculating S: START
#> Calculating S: DONE
#> Calculate GDI dataframe: START
#> Calculate GDI dataframe: DONE
#> Removed 0 low GDI genes (such as the fully-expressed) in GDI plot
The percentage of cells expressing the gene in the third column of this data-frame is reported.
To perform the Gene Pair Analysis, we can plot a heatmap of the COEX
values
between two gene sets.
We have to define the different gene sets (list.genes
) in a list.
Then we can choose which sets to use in the function parameter sets
(for example, from 1 to 3).
We also have to provide an array of the file name prefixes for each condition
(for example, “mouse_cortex_E17.5”).
In fact, this function can plot genes relationships across many different
conditions to get a complete overview.
print(cond)
#> [1] "mouse_cortex_E17.5"
heatmapPlot(genesLists = genesList, sets = c(1:3),
conditions = c(cond), dir = outDir)
#> heatmap plot: START
#> Calculating S: START
#> Calculating S: DONE
#> calculating PValues: START
#> Get p-values on a set of genes on columns and on a set of genes on rows
#> calculating PValues: DONE
#> min coex: -0.481187309336149 max coex: 0.467915753149414
We can also plot a general heatmap of COEX
values based on some markers like
the following one.
genesHeatmapPlot(obj, primaryMarkers = c("Satb2", "Bcl11b", "Vim", "Hes1"),
pValueThreshold = 0.001, symmetric = TRUE)
#> Calculating S: START
#> Calculating S: DONE
#> calculating PValues: START
#> Get p-values genome wide on columns and genome wide on rows
#> calculating PValues: DONE
genesHeatmapPlot(obj, primaryMarkers = c("Satb2", "Bcl11b", "Fezf2"),
secondaryMarkers = c("Gabra3", "Meg3", "Cux1", "Neurod6"),
pValueThreshold = 0.001, symmetric = FALSE)
#> Calculating S: START
#> Calculating S: DONE
#> calculating PValues: START
#> Get p-values genome wide on columns and genome wide on rows
#> calculating PValues: DONE
Sometimes we can also be interested in the numbers present directly in the contingency tables for two specific genes. To get them we can use two functions:
contingencyTables()
to produce the observed and expected data
c(observedCT, expectedCT) %<-% contingencyTables(obj, g1 = "Satb2",
g2 = "Bcl11b")
print("Observed CT")
#> [1] "Observed CT"
observedCT
#> Satb2.yes Satb2.no
#> Bcl11b.yes 47 149
#> Bcl11b.no 287 358
print("Expected CT")
#> [1] "Expected CT"
expectedCT
#> Satb2.yes Satb2.no
#> Bcl11b.yes 83.50828 112.4908
#> Bcl11b.no 250.49207 394.5088
Another useful function is getGenesCoex()
. This can be used to extract
the whole or a partial COEX
matrix from a COTAN
object.
# For the whole matrix
coex <- getGenesCoex(obj, zeroDiagonal = FALSE)
coex[1:5, 1:5]
#> 5 x 5 Matrix of class "dspMatrix"
#> 0610007N19Rik 0610007P14Rik 0610009B22Rik 0610009D07Rik
#> 0610007N19Rik 0.672634305 -0.020840728 0.0342777099 0.0118370376
#> 0610007P14Rik -0.020840728 0.927688746 0.0121283964 -0.0092518514
#> 0610009B22Rik 0.034277710 0.012128396 0.9258121900 -0.0005812182
#> 0610009D07Rik 0.011837038 -0.009251851 -0.0005812182 0.9256867093
#> 0610009E02Rik -0.009531833 -0.058088014 0.0520859067 0.0228546807
#> 0610009E02Rik
#> 0610007N19Rik -0.009531833
#> 0610007P14Rik -0.058088014
#> 0610009B22Rik 0.052085907
#> 0610009D07Rik 0.022854681
#> 0610009E02Rik 0.373135350
# For a partial matrix
coex <- getGenesCoex(obj, genes = c("Satb2","Bcl11b","Fezf2"))
head(coex)
#> 6 x 3 Matrix of class "dgeMatrix"
#> Bcl11b Fezf2 Satb2
#> 0610007N19Rik -0.034567790 -0.0015980216 -0.11573795
#> 0610007P14Rik 0.031908335 0.0152429083 -0.01208885
#> 0610009B22Rik -0.016003248 0.0842185237 -0.03780104
#> 0610009D07Rik 0.002291428 0.0002109366 -0.04022418
#> 0610009E02Rik 0.059523012 0.0315206070 -0.02990082
#> 0610009L18Rik 0.018350626 0.0467297245 -0.01862180
COTAN
provides a way to establish genes’ clusters given some lists of markers
layersGenes = list(
"L1" = c("Reln", "Lhx5"),
"L2/3" = c("Satb2", "Cux1"),
"L4" = c("Rorb", "Sox5"),
"L5/6" = c("Bcl11b", "Fezf2"),
"Prog" = c("Vim", "Hes1")
)
c(gSpace, eigPlot, pcaClustersDF, treePlot) %<-%
establishGenesClusters(obj, groupMarkers = layersGenes,
numGenesPerMarker = 25, kCuts = 6)
#> Establishing gene clusters - START
#> Calculating gene coexpression space - START
#> Calculating S: START
#> Calculating S: DONE
#> calculating PValues: START
#> Get p-values on a set of genes on columns and genome wide on rows
#> calculating PValues: DONE
#> Number of selected secondary markers: 182
#> Calculating S: START
#> Calculating S: DONE
#> Calculating gene coexpression space - DONE
#> Establishing gene clusters - DONE
plot(eigPlot)
plot(treePlot)
UMAPPlot(pcaClustersDF[, 1:10],
clusters = pcaClustersDF[["hclust"]],
elements = layersGenes,
title = "Genes' clusters UMAP Plot")
#> UMAP plot
It is possible to obtain a cell clusterization based on the concept of
uniformity of expression of the genes across the cells. That is the cluster
satisfies the null hypothesis of the COTAN
model:
the genes expression is not dependent on the cell in consideration.
fineClusters <- cellsUniformClustering(obj, GDIThreshold = 1.4, cores = 10,
saveObj = TRUE, outDir = outDir)
obj <- addClusterization(obj, clName = "FineClusters", clusters = fineClusters)
c(coexDF, pValueDF) %<-% DEAOnClusters(obj, clusters = fineClusters)
obj <- addClusterizationCoex(obj, clName = "FineClusters",
coexDF = coexDF)
c(mergedClusters, coexDF, pValueDF) %<-%
mergeUniformCellsClusters(obj, GDIThreshold = 1.4, cores = 10,
saveObj = TRUE, outDir = outDir)
obj <- addClusterization(obj, clName = "MergedClusters",
clusters = mergedClusters, coexDF = coexDF)
mergedUMAPPlot <- UMAPPlot(coexDF, elements = layersGenes,
title = "Fine Cluster UMAP Plot")
plot(mergedUMAPPlot)
The next few lines are just to clean.
if (file.exists(file.path(outDir, paste0(cond, ".cotan.RDS")))) {
#Delete file if it exists
file.remove(file.path(outDir, paste0(cond, ".cotan.RDS")))
}
#> [1] TRUE
unlink(file.path(outDir, cond), recursive = TRUE)
file.remove(dataSetFile)
#> [1] TRUE
# stop logging to file
setLoggingFile("")
#> Closing previous log file - Setting log file to be:
file.remove(file.path(outDir, "vignette_v1.log"))
#> [1] TRUE
options(parallelly.fork.enable = FALSE)
sessionInfo()
#> R version 4.3.1 (2023-06-16)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 22.04.3 LTS
#>
#> Matrix products: default
#> BLAS: /home/biocbuild/bbs-3.17-bioc/R/lib/libRblas.so
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: America/New_York
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] GEOquery_2.68.0 Biobase_2.60.0 BiocGenerics_0.46.0
#> [4] qpdf_1.3.2 Rtsne_0.16 factoextra_1.0.7
#> [7] ggplot2_3.4.3 data.table_1.14.8 zeallot_0.1.0
#> [10] COTAN_2.0.5 BiocStyle_2.28.0
#>
#> loaded via a namespace (and not attached):
#> [1] RcppAnnoy_0.0.21 splines_4.3.1 later_1.3.1
#> [4] tibble_3.2.1 polyclip_1.10-4 lifecycle_1.0.3
#> [7] rstatix_0.7.2 doParallel_1.0.17 globals_0.16.2
#> [10] lattice_0.21-8 MASS_7.3-60 backports_1.4.1
#> [13] dendextend_1.17.1 magrittr_2.0.3 limma_3.56.2
#> [16] plotly_4.10.2 sass_0.4.7 rmarkdown_2.24
#> [19] jquerylib_0.1.4 yaml_2.3.7 httpuv_1.6.11
#> [22] Seurat_4.3.0.1 sctransform_0.3.5 askpass_1.1
#> [25] sp_2.0-0 spatstat.sparse_3.0-2 reticulate_1.31
#> [28] cowplot_1.1.1 pbapply_1.7-2 RColorBrewer_1.1-3
#> [31] abind_1.4-5 purrr_1.0.2 circlize_0.4.15
#> [34] IRanges_2.34.1 S4Vectors_0.38.1 ggrepel_0.9.3
#> [37] irlba_2.3.5.1 listenv_0.9.0 spatstat.utils_3.0-3
#> [40] umap_0.2.10.0 goftest_1.2-3 RSpectra_0.16-1
#> [43] spatstat.random_3.1-5 fitdistrplus_1.1-11 parallelly_1.36.0
#> [46] leiden_0.4.3 codetools_0.2-19 xml2_1.3.5
#> [49] tidyselect_1.2.0 shape_1.4.6 farver_2.1.1
#> [52] viridis_0.6.4 matrixStats_1.0.0 stats4_4.3.1
#> [55] spatstat.explore_3.2-1 jsonlite_1.8.7 GetoptLong_1.0.5
#> [58] ellipsis_0.3.2 progressr_0.14.0 ggridges_0.5.4
#> [61] survival_3.5-7 iterators_1.0.14 foreach_1.5.2
#> [64] tools_4.3.1 ica_1.0-3 Rcpp_1.0.11
#> [67] glue_1.6.2 gridExtra_2.3 xfun_0.40
#> [70] ggthemes_4.2.4 dplyr_1.1.2 withr_2.5.0
#> [73] BiocManager_1.30.22 fastmap_1.1.1 fansi_1.0.4
#> [76] openssl_2.1.0 digest_0.6.33 R6_2.5.1
#> [79] mime_0.12 colorspace_2.1-0 scattermore_1.2
#> [82] Cairo_1.6-1 tensor_1.5 spatstat.data_3.0-1
#> [85] utf8_1.2.3 tidyr_1.3.0 generics_0.1.3
#> [88] httr_1.4.7 htmlwidgets_1.6.2 uwot_0.1.16
#> [91] pkgconfig_2.0.3 gtable_0.3.4 ComplexHeatmap_2.16.0
#> [94] lmtest_0.9-40 htmltools_0.5.6 carData_3.0-5
#> [97] bookdown_0.35 clue_0.3-64 SeuratObject_4.1.3
#> [100] scales_1.2.1 png_0.1-8 knitr_1.43
#> [103] tzdb_0.4.0 reshape2_1.4.4 rjson_0.2.21
#> [106] nlme_3.1-163 curl_5.0.2 cachem_1.0.8
#> [109] zoo_1.8-12 GlobalOptions_0.1.2 stringr_1.5.0
#> [112] KernSmooth_2.23-22 parallel_4.3.1 miniUI_0.1.1.1
#> [115] RcppZiggurat_0.1.6 pillar_1.9.0 grid_4.3.1
#> [118] vctrs_0.6.3 RANN_2.6.1 promises_1.2.1
#> [121] ggpubr_0.6.0 car_3.1-2 xtable_1.8-4
#> [124] cluster_2.1.4 evaluate_0.21 readr_2.1.4
#> [127] magick_2.7.5 cli_3.6.1 compiler_4.3.1
#> [130] rlang_1.1.1 crayon_1.5.2 future.apply_1.11.0
#> [133] ggsignif_0.6.4 labeling_0.4.2 plyr_1.8.8
#> [136] stringi_1.7.12 viridisLite_0.4.2 deldir_1.0-9
#> [139] assertthat_0.2.1 munsell_0.5.0 lazyeval_0.2.2
#> [142] spatstat.geom_3.2-4 Matrix_1.6-1 hms_1.1.3
#> [145] patchwork_1.1.3 future_1.33.0 shiny_1.7.5
#> [148] highr_0.10 ROCR_1.0-11 Rfast_2.0.8
#> [151] broom_1.0.5 igraph_1.5.1 bslib_0.5.1