Contents

1 Introduction

miaSim implements tools for microbiome data simulation based on different ecological modeling assumptions. These can be used to simulate species abundance matrices, including time series. For a detailed function documentation, see the function reference page

Install the Bioconductor release version with

if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

Load the library

library(miaSim)

2 Case study 3: Environmental complexity impacts on different communities

(Reference: Non-additive microbial community responses to environmental complexity)

The aim of this case study is to recalculate the patterns shown in the existing publication in the above reference. More specifically, the left part of this Figure 2. Figure 2

In this case study, consumer-resource model was used. The number of species in 3 distinct communities varied so that communities are named after the number of species (i.e. com13, com3, and com4).

For each community, a floor of different numbers of carbon resources (nutrients) is set from 1, 2, 4, 8, 16, to 32. For demo purposes we include here a smaller number of cases.

In the original experiment, value of OD600 is selected as y-axis to represent the growth yield. In our simulations, however, the total number of organisms(number of all individuals) is used to reflect the growth yield.

2.1 Load dependencies

library(ggplot2)
library(miaSim)
library(vegan)
library(ggplot2)
library(reshape2)

2.2 Set random seed and number of repeats

set.seed(42)
# n_rep <- 50 # Full (and slow) example
n_rep <- 5 # Speed up demo example

2.3 initial the output dataframes to store data

result_df <- data.frame(
    n_species = integer(),
    theta = numeric(),
    i = integer(),
    n_resources = integer(),
    value = numeric()
)
result_df2 <- data.frame(
  matrix(NA, nrow = 0, ncol = 13, dimnames = list(NULL, paste0("sp", seq_len(13))))
)
sorensen_df <- data.frame(
    n_species = integer(),
    theta = numeric(),
    rho_mean = numeric(),
    rho_sd = numeric()
)

2.4 generating function

this function generates a data frame, where each row is arranged in an increasing dissimilarity to the first row.

gradient_df_generator <- function(n_row, n_col, density_row, max_gradient, error_interval){
  list_initial <- list()
  dissimilarity.gradient <- seq(from = 0, to = max_gradient, length.out = n_row)
  for (i in seq_len(n_row)){
    print(i)
    if (i == 1){
      row_temp <- rbeta(n_col, 1, n_col)
      col_to_remove <- sample(x = seq_len(n_col), size = n_col-n_col*density_row)
      row_temp[col_to_remove] <- 0
      list_initial[[i]] <- row_temp
    } else {
      while (length(list_initial) < i) {
        row_temp <- rbeta(n_col, 1, n_col)
        col_to_remove <- sample(x = seq_len(n_col), size = n_col-n_col*density_row)
        row_temp[col_to_remove] <- 0
        diff_temp <- abs(vegdist(rbind(list_initial[[1]], row_temp), method = "bray") - dissimilarity.gradient[i])
        if (diff_temp < error_interval) {
          list_initial[[i]] <- row_temp
        }
      }
    }
  }

  # Return
  as.data.frame(t(matrix(unlist(list_initial), ncol = n_row)))

}

2.5 Load parameters

Load parameters used by Pacheco et al., and initialized the community data frame

Note that in this step, the value range of theta has been extended.

# Full example
#n_species_types <- c(13, 3, 4)
#theta_types <- c(1, 0.75, 0.5, 0.25, 0.1, 0.05)
#n_resources_types <- c(1,2,4,8,16,32)

# Speeding up the example by reducing the number of combinations
n_species_types <- c(3, 4)
theta_types <- c(1, 0.5)
n_resources_types <- c(1,2)

community.initial.df <- as.list(
  lapply(n_species_types, 
         gradient_df_generator, 
         n_row = n_rep, 
         density_row = 1,
         max_gradient = 0.7,
         error_interval = 0.15)
)

2.6 Loop over different combinations of (number of species X theta X number of repetations X number of resources)

n_species_types <- c(3, 4)
for (n_species in n_species_types){
    for (theta in theta_types) {
        sorensen <- c()
        for (i in seq_len(n_rep)){
            for (n_resources in n_resources_types) {
                ### generate E ####
                Etest <- randomE(n_species = n_species,
                                 n_resources = n_resources,
                                 mean_consumption = theta*n_resources,
                                 exact = TRUE)

                ### calculate rho ####
                if (n_resources == max(n_resources_types)){
                    Etest_pos <- Etest
                    Etest_pos[Etest_pos<0] <- 0
                    for (j in seq_len(n_species - 1)){
                        for (k in 2:n_species){
                            sorensen <- c(sorensen,
                                          sum(apply(Etest_pos[c(j,k),], 2, min)))
                        }
                    }
                }

                if (n_resources > 1){
                    Priority <- t(apply(matrix(sample(n_species * n_resources), nrow = n_species), 1, order))
                    Priority <- (Etest > 0) * Priority
                } else {
                    Priority <- NULL
                }

                print(paste0("n_species=",n_species, " theta=",theta, " i=", i, " n_resources=", n_resources))
                x0temp <- as.numeric(community.initial.df[[match(n_species, n_species_types)]][i,])
                x0temp <- 10*x0temp/sum(x0temp)
                CRMtest <- simulateConsumerResource(n_species = n_species,
                                                    n_resources = n_resources,
                                                    x0 = x0temp, #rep(10, n_species),
                                                    resources = rep(100, n_resources),
                                                    E = Etest,
                                                    # trophic_priority = Priority,
                                                    stochastic = TRUE,
                                                    t_end = 1000,
                                                    t_step = 1,
                                                    t_store = 1000)
                CRMspecies <- assay(CRMtest, "counts")[, ncol(CRMtest)] # last time point; was getCommunity(CRMtest)
                CRMspeciesTotal <- sum(CRMspecies)
                result_df[nrow(result_df)+1,] <- c(n_species, theta, i, n_resources, CRMspeciesTotal)
                result_df2[nrow(result_df2)+1,] <- c(CRMspecies, rep(NA, 13-length(CRMspecies)))

            }
        }
        rho_mean <- mean(sorensen)
        rho_sd <- var(sorensen)
        sorensen_df[nrow(sorensen_df)+1, ] <- c(n_species, theta, rho_mean, rho_sd)
    }
}
n_species_types <- c(3, 4)
p <- ggplot(result_df, aes(x = n_resources, y = value, group = n_resources)) +
    geom_boxplot(outlier.shape = NA) +
    geom_jitter(alpha = 0.2, width = 0.2) +
    facet_grid(. ~ factor(n_species, levels = n_species_types)) +
    theme_bw() +
    scale_x_continuous(trans = "log2", breaks = n_resources_types) +
    labs(x = "number of resources",
         y = "growth yield (number of individuals)")
print(p)