knitr::opts_chunk$set(message = FALSE, warning = FALSE, comment = NA,
fig.width = 6.25, fig.height = 5)
library(ANCOMBC)
library(tidyverse)
Sparse Estimation of Correlations among Microbiomes (SECOM) (Lin, Eggesbø, and Peddada 2022) is a methodology that aims to detect both linear and nonlinear relationships between a pair of taxa within an ecosystem (e.g., gut) or across ecosystems (e.g., gut and tongue). SECOM corrects both sample-specific and taxon-specific biases and obtains a consistent estimator for the correlation matrix of microbial absolute abundances while maintaining the underlying true sparsity. For more details, please refer to the SECOM paper.
Download package.
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("ANCOMBC")
Load the package.
The HITChip Atlas dataset contains genus-level microbiota profiling with HITChip for 1006 western adults with no reported health complications, reported in (Lahti et al. 2014). The dataset is available via the microbiome R package (Lahti et al. 2017) in phyloseq (McMurdie and Holmes 2013) format.
data(atlas1006, package = "microbiome")
tse = mia::makeTreeSummarizedExperimentFromPhyloseq(atlas1006)
# subset to baseline
tse = tse[, tse$time == 0]
# Re-code the bmi group
tse$bmi = recode(tse$bmi_group,
obese = "obese",
severeobese = "obese",
morbidobese = "obese")
# Subset to lean, overweight, and obese subjects
tse = tse[, tse$bmi %in% c("lean", "overweight", "obese")]
# Create the region variable
tse$region = recode(as.character(tse$nationality),
Scandinavia = "NE", UKIE = "NE", SouthEurope = "SE",
CentralEurope = "CE", EasternEurope = "EE",
.missing = "unknown")
# Discard "EE" as it contains only 1 subject
# Discard subjects with missing values of region
tse = tse[, ! tse$region %in% c("EE", "unknown")]
print(tse)
class: TreeSummarizedExperiment
dim: 130 873
metadata(0):
assays(1): counts
rownames(130): Actinomycetaceae Aerococcus ... Xanthomonadaceae
Yersinia et rel.
rowData names(3): Phylum Family Genus
colnames(873): Sample-1 Sample-2 ... Sample-1005 Sample-1006
colData names(12): age sex ... bmi region
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
rowLinks: NULL
rowTree: NULL
colLinks: NULL
colTree: NULL
set.seed(123)
# Linear relationships
res_linear = secom_linear(data = list(tse), assay_name = "counts",
tax_level = "Phylum", pseudo = 0,
prv_cut = 0.5, lib_cut = 1000, corr_cut = 0.5,
wins_quant = c(0.05, 0.95), method = "pearson",
soft = FALSE, thresh_len = 20, n_cv = 10,
thresh_hard = 0.3, max_p = 0.005, n_cl = 2)
# Nonlinear relationships
res_dist = secom_dist(data = list(tse), assay_name = "counts",
tax_level = "Phylum", pseudo = 0,
prv_cut = 0.5, lib_cut = 1000, corr_cut = 0.5,
wins_quant = c(0.05, 0.95), R = 1000,
thresh_hard = 0.3, max_p = 0.005, n_cl = 2)
corr_linear = res_linear$corr_th
cooccur_linear = res_linear$mat_cooccur
# Filter by co-occurrence
overlap = 10
corr_linear[cooccur_linear < overlap] = 0
df_linear = data.frame(get_upper_tri(corr_linear)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(value = round(value, 2))
tax_name = sort(union(df_linear$var1, df_linear$var2))
df_linear$var1 = factor(df_linear$var1, levels = tax_name)
df_linear$var2 = factor(df_linear$var2, levels = tax_name)
heat_linear_th = df_linear %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "grey",
midpoint = 0, limit = c(-1,1), space = "Lab",
name = NULL) +
scale_x_discrete(drop = FALSE) +
scale_y_discrete(drop = FALSE) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "Pearson (Thresholding)") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1,
face = "italic"),
axis.text.y = element_text(size = 12, face = "italic"),
strip.text.x = element_text(size = 14),
strip.text.y = element_text(size = 14),
legend.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 15),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = "none") +
coord_fixed()
heat_linear_th
corr_linear = res_linear$corr_fl
cooccur_linear = res_linear$mat_cooccur
# Filter by co-occurrence
overlap = 10
corr_linear[cooccur_linear < overlap] = 0
df_linear = data.frame(get_upper_tri(corr_linear)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(value = round(value, 2))
tax_name = sort(union(df_linear$var1, df_linear$var2))
df_linear$var1 = factor(df_linear$var1, levels = tax_name)
df_linear$var2 = factor(df_linear$var2, levels = tax_name)
heat_linear_fl = df_linear %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "grey",
midpoint = 0, limit = c(-1,1), space = "Lab",
name = NULL) +
scale_x_discrete(drop = FALSE) +
scale_y_discrete(drop = FALSE) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "Pearson (Filtering)") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1,
face = "italic"),
axis.text.y = element_text(size = 12, face = "italic"),
strip.text.x = element_text(size = 14),
strip.text.y = element_text(size = 14),
legend.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 15),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = "none") +
coord_fixed()
heat_linear_fl
corr_dist = res_dist$dcorr_fl
cooccur_dist = res_dist$mat_cooccur
# Filter by co-occurrence
overlap = 10
corr_dist[cooccur_dist < overlap] = 0
df_dist = data.frame(get_upper_tri(corr_dist)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(value = round(value, 2))
tax_name = sort(union(df_dist$var1, df_dist$var2))
df_dist$var1 = factor(df_dist$var1, levels = tax_name)
df_dist$var2 = factor(df_dist$var2, levels = tax_name)
heat_dist_fl = df_dist %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", na.value = "grey",
midpoint = 0, limit = c(-1,1), space = "Lab",
name = NULL) +
scale_x_discrete(drop = FALSE) +
scale_y_discrete(drop = FALSE) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "Distance (Filtering)") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1,
face = "italic"),
axis.text.y = element_text(size = 12, face = "italic"),
strip.text.x = element_text(size = 14),
strip.text.y = element_text(size = 14),
legend.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 15),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = "none") +
coord_fixed()
heat_dist_fl
To compute correlations whithin and across different ecosystems, one needs to make sure that there are samples in common across these ecosystems.
# Select subjects from "CE" and "NE"
tse1 = tse[, tse$region == "CE"]
tse2 = tse[, tse$region == "NE"]
# Rename samples to ensure there is an overlap of samples between CE and NE
colnames(tse1) = paste0("Sample-", seq_len(ncol(tse1)))
colnames(tse2) = paste0("Sample-", seq_len(ncol(tse2)))
print(tse1)
class: TreeSummarizedExperiment
dim: 130 578
metadata(0):
assays(1): counts
rownames(130): Actinomycetaceae Aerococcus ... Xanthomonadaceae
Yersinia et rel.
rowData names(3): Phylum Family Genus
colnames(578): Sample-1 Sample-2 ... Sample-577 Sample-578
colData names(12): age sex ... bmi region
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
rowLinks: NULL
rowTree: NULL
colLinks: NULL
colTree: NULL
class: TreeSummarizedExperiment
dim: 130 181
metadata(0):
assays(1): counts
rownames(130): Actinomycetaceae Aerococcus ... Xanthomonadaceae
Yersinia et rel.
rowData names(3): Phylum Family Genus
colnames(181): Sample-1 Sample-2 ... Sample-180 Sample-181
colData names(12): age sex ... bmi region
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):
rowLinks: NULL
rowTree: NULL
colLinks: NULL
colTree: NULL
set.seed(123)
# Linear relationships
res_linear = secom_linear(data = list(CE = tse1, NE = tse2),
assay_name = c("counts", "counts"),
tax_level = c("Phylum", "Phylum"), pseudo = 0,
prv_cut = 0.5, lib_cut = 1000, corr_cut = 0.5,
wins_quant = c(0.05, 0.95), method = "pearson",
soft = FALSE, thresh_len = 20, n_cv = 10,
thresh_hard = 0.3, max_p = 0.005, n_cl = 2)
# Nonlinear relationships
res_dist = secom_dist(data = list(CE = tse1, NE = tse2),
assay_name = c("counts", "counts"),
tax_level = c("Phylum", "Phylum"), pseudo = 0,
prv_cut = 0.5, lib_cut = 1000, corr_cut = 0.5,
wins_quant = c(0.05, 0.95), R = 1000,
thresh_hard = 0.3, max_p = 0.005, n_cl = 2)
corr_linear = res_linear$corr_th
cooccur_linear = res_linear$mat_cooccur
# Filter by co-occurrence
overlap = 10
corr_linear[cooccur_linear < overlap] = 0
df_linear = data.frame(get_upper_tri(corr_linear)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(var2 = gsub("\\...", " - ", var2),
value = round(value, 2))
tax_name = sort(union(df_linear$var1, df_linear$var2))
df_linear$var1 = factor(df_linear$var1, levels = tax_name)
df_linear$var2 = factor(df_linear$var2, levels = tax_name)
txt_color = ifelse(grepl("CE", tax_name), "#1B9E77", "#D95F02")
heat_linear_th = df_linear %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
na.value = "grey", midpoint = 0, limit = c(-1,1),
space = "Lab", name = NULL) +
scale_x_discrete(drop = FALSE) +
scale_y_discrete(drop = FALSE) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "Pearson (Thresholding)") +
theme_bw() +
geom_vline(xintercept = 6.5, color = "blue", linetype = "dashed") +
geom_hline(yintercept = 6.5, color = "blue", linetype = "dashed") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1,
face = "italic", color = txt_color),
axis.text.y = element_text(size = 12, face = "italic",
color = txt_color),
strip.text.x = element_text(size = 14),
strip.text.y = element_text(size = 14),
legend.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 15),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = "none") +
coord_fixed()
heat_linear_th
corr_linear = res_linear$corr_th
cooccur_linear = res_linear$mat_cooccur
# Filter by co-occurrence
overlap = 10
corr_linear[cooccur_linear < overlap] = 0
df_linear = data.frame(get_upper_tri(corr_linear)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(var2 = gsub("\\...", " - ", var2),
value = round(value, 2))
tax_name = sort(union(df_linear$var1, df_linear$var2))
df_linear$var1 = factor(df_linear$var1, levels = tax_name)
df_linear$var2 = factor(df_linear$var2, levels = tax_name)
txt_color = ifelse(grepl("CE", tax_name), "#1B9E77", "#D95F02")
heat_linear_fl = df_linear %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
na.value = "grey", midpoint = 0, limit = c(-1,1),
space = "Lab", name = NULL) +
scale_x_discrete(drop = FALSE) +
scale_y_discrete(drop = FALSE) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "Pearson (Filtering)") +
theme_bw() +
geom_vline(xintercept = 6.5, color = "blue", linetype = "dashed") +
geom_hline(yintercept = 6.5, color = "blue", linetype = "dashed") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1,
face = "italic", color = txt_color),
axis.text.y = element_text(size = 12, face = "italic",
color = txt_color),
strip.text.x = element_text(size = 14),
strip.text.y = element_text(size = 14),
legend.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 15),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = "none") +
coord_fixed()
heat_linear_fl
corr_dist = res_dist$dcorr_fl
cooccur_dist = res_dist$mat_cooccur
# Filter by co-occurrence
overlap = 10
corr_dist[cooccur_dist < overlap] = 0
df_dist = data.frame(get_upper_tri(corr_dist)) %>%
rownames_to_column("var1") %>%
pivot_longer(cols = -var1, names_to = "var2", values_to = "value") %>%
filter(!is.na(value)) %>%
mutate(var2 = gsub("\\...", " - ", var2),
value = round(value, 2))
tax_name = sort(union(df_dist$var1, df_dist$var2))
df_dist$var1 = factor(df_dist$var1, levels = tax_name)
df_dist$var2 = factor(df_dist$var2, levels = tax_name)
txt_color = ifelse(grepl("CE", tax_name), "#1B9E77", "#D95F02")
heat_dist_fl = df_dist %>%
ggplot(aes(var2, var1, fill = value)) +
geom_tile(color = "black") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
na.value = "grey", midpoint = 0, limit = c(-1,1),
space = "Lab", name = NULL) +
scale_x_discrete(drop = FALSE) +
scale_y_discrete(drop = FALSE) +
geom_text(aes(var2, var1, label = value), color = "black", size = 4) +
labs(x = NULL, y = NULL, title = "Distance (Filtering)") +
theme_bw() +
geom_vline(xintercept = 6.5, color = "blue", linetype = "dashed") +
geom_hline(yintercept = 6.5, color = "blue", linetype = "dashed") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1,
face = "italic", color = txt_color),
axis.text.y = element_text(size = 12, face = "italic",
color = txt_color),
strip.text.x = element_text(size = 14),
strip.text.y = element_text(size = 14),
legend.text = element_text(size = 12),
plot.title = element_text(hjust = 0.5, size = 15),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
legend.position = "none") +
coord_fixed()
heat_dist_fl
R version 4.4.0 beta (2024-04-15 r86425)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 22.04.4 LTS
Matrix products: default
BLAS: /home/biocbuild/bbs-3.19-bioc/R/lib/libRblas.so
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
time zone: America/New_York
tzcode source: system (glibc)
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] doRNG_1.8.6 rngtools_1.5.2 foreach_1.5.2 DT_0.33
[5] phyloseq_1.48.0 lubridate_1.9.3 forcats_1.0.0 stringr_1.5.1
[9] dplyr_1.1.4 purrr_1.0.2 readr_2.1.5 tidyr_1.3.1
[13] tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0 ANCOMBC_2.6.0
loaded via a namespace (and not attached):
[1] splines_4.4.0 cellranger_1.1.0
[3] rpart_4.1.23 DirichletMultinomial_1.46.0
[5] lifecycle_1.0.4 Rdpack_2.6
[7] doParallel_1.0.17 lattice_0.22-6
[9] MASS_7.3-60.2 crosstalk_1.2.1
[11] MultiAssayExperiment_1.30.0 backports_1.4.1
[13] magrittr_2.0.3 Hmisc_5.1-2
[15] sass_0.4.9 rmarkdown_2.26
[17] jquerylib_0.1.4 yaml_2.3.8
[19] gld_2.6.6 DBI_1.2.2
[21] minqa_1.2.6 ade4_1.7-22
[23] multcomp_1.4-25 abind_1.4-5
[25] zlibbioc_1.50.0 expm_0.999-9
[27] GenomicRanges_1.56.0 BiocGenerics_0.50.0
[29] yulab.utils_0.1.4 nnet_7.3-19
[31] TH.data_1.1-2 sandwich_3.1-0
[33] GenomeInfoDbData_1.2.12 IRanges_2.38.0
[35] S4Vectors_0.42.0 ggrepel_0.9.5
[37] irlba_2.3.5.1 tidytree_0.4.6
[39] vegan_2.6-4 permute_0.9-7
[41] DelayedMatrixStats_1.26.0 codetools_0.2-20
[43] DelayedArray_0.30.0 scuttle_1.14.0
[45] energy_1.7-11 tidyselect_1.2.1
[47] farver_2.1.1 UCSC.utils_1.0.0
[49] lme4_1.1-35.3 gmp_0.7-4
[51] ScaledMatrix_1.12.0 viridis_0.6.5
[53] matrixStats_1.3.0 stats4_4.4.0
[55] base64enc_0.1-3 jsonlite_1.8.8
[57] multtest_2.60.0 BiocNeighbors_1.22.0
[59] e1071_1.7-14 decontam_1.24.0
[61] mia_1.12.0 Formula_1.2-5
[63] survival_3.6-4 scater_1.32.0
[65] iterators_1.0.14 tools_4.4.0
[67] treeio_1.28.0 DescTools_0.99.54
[69] Rcpp_1.0.12 glue_1.7.0
[71] gridExtra_2.3 SparseArray_1.4.0
[73] xfun_0.43 mgcv_1.9-1
[75] MatrixGenerics_1.16.0 GenomeInfoDb_1.40.0
[77] TreeSummarizedExperiment_2.12.0 withr_3.0.0
[79] numDeriv_2016.8-1.1 fastmap_1.1.1
[81] rhdf5filters_1.16.0 boot_1.3-30
[83] bluster_1.14.0 fansi_1.0.6
[85] digest_0.6.35 rsvd_1.0.5
[87] timechange_0.3.0 R6_2.5.1
[89] colorspace_2.1-0 gtools_3.9.5
[91] utf8_1.2.4 generics_0.1.3
[93] data.table_1.15.4 DECIPHER_3.0.0
[95] class_7.3-22 CVXR_1.0-12
[97] httr_1.4.7 htmlwidgets_1.6.4
[99] S4Arrays_1.4.0 pkgconfig_2.0.3
[101] gtable_0.3.5 Exact_3.2
[103] Rmpfr_0.9-5 SingleCellExperiment_1.26.0
[105] XVector_0.44.0 htmltools_0.5.8.1
[107] biomformat_1.32.0 scales_1.3.0
[109] Biobase_2.64.0 lmom_3.0
[111] knitr_1.46 rstudioapi_0.16.0
[113] tzdb_0.4.0 reshape2_1.4.4
[115] checkmate_2.3.1 nlme_3.1-164
[117] nloptr_2.0.3 rhdf5_2.48.0
[119] proxy_0.4-27 cachem_1.0.8
[121] zoo_1.8-12 rootSolve_1.8.2.4
[123] parallel_4.4.0 vipor_0.4.7
[125] foreign_0.8-86 pillar_1.9.0
[127] grid_4.4.0 vctrs_0.6.5
[129] BiocSingular_1.20.0 beachmat_2.20.0
[131] cluster_2.1.6 beeswarm_0.4.0
[133] htmlTable_2.4.2 evaluate_0.23
[135] mvtnorm_1.2-4 cli_3.6.2
[137] compiler_4.4.0 rlang_1.1.3
[139] crayon_1.5.2 labeling_0.4.3
[141] plyr_1.8.9 fs_1.6.4
[143] ggbeeswarm_0.7.2 stringi_1.8.3
[145] viridisLite_0.4.2 BiocParallel_1.38.0
[147] lmerTest_3.1-3 munsell_0.5.1
[149] Biostrings_2.72.0 gsl_2.1-8
[151] lazyeval_0.2.2 Matrix_1.7-0
[153] hms_1.1.3 sparseMatrixStats_1.16.0
[155] bit64_4.0.5 Rhdf5lib_1.26.0
[157] highr_0.10 SummarizedExperiment_1.34.0
[159] rbibutils_2.2.16 igraph_2.0.3
[161] memoise_2.0.1 bslib_0.7.0
[163] bit_4.0.5 readxl_1.4.3
[165] ape_5.8
Lahti, Leo, Jarkko Salojärvi, Anne Salonen, Marten Scheffer, and Willem M De Vos. 2014. “Tipping Elements in the Human Intestinal Ecosystem.” Nature Communications 5 (1): 1–10.
Lahti, Leo, Sudarshan Shetty, T Blake, J Salojarvi, and others. 2017. “Tools for Microbiome Analysis in R.” Version 1: 10013.
Lin, Huang, Merete Eggesbø, and Shyamal Das Peddada. 2022. “Linear and Nonlinear Correlation Estimators Unveil Undescribed Taxa Interactions in Microbiome Data.” Nature Communications 13 (1): 1–16.
McMurdie, Paul J, and Susan Holmes. 2013. “Phyloseq: An R Package for Reproducible Interactive Analysis and Graphics of Microbiome Census Data.” PloS One 8 (4): e61217.