## ---- echo=FALSE, results="asis", message=FALSE, KnitrSetUp------------------- library(knitr) knit_hooks$set(crop = hook_pdfcrop) knitr::opts_chunk$set(crop = TRUE, tidy=FALSE, warning=FALSE,message=FALSE, fig.align="center") Biocpkg <- function (pkg){ sprintf("[%s](http://bioconductor.org/packages/%s)", pkg, pkg) } CRANpkg <- function(pkg){ cran <- "https://CRAN.R-project.org/package" fmt <- "[%s](%s=%s)" sprintf(fmt, pkg, cran, pkg) } ## ---- echo=FALSE, results="hide", message=FALSE, Loadpackages----------------- library(ggplot2) library(phyloseq) library(shadowtext) library(ggtree) library(ggtreeExtra) library(treeio) library(tidytree) library(MicrobiotaProcess) ## ---- echo=FALSE, fig.width = 12, dpi=400, fig.align="center", fig.cap= "The structure of the MPSE class."---- knitr::include_graphics("./mpse.png") ## ---- echo=FALSE, fig.width = 12, dpi=400, fig.align="center", fig.cap="The Overview of the design of MicrobiotaProcess package"---- knitr::include_graphics("./mp-design.png") ## ---- warning=FALSE, message=FALSE-------------------------------------------- library(MicrobiotaProcess) #parsing the output of dada2 seqtabfile <- system.file("extdata", "seqtab.nochim.rds", package="MicrobiotaProcess") seqtab <- readRDS(seqtabfile) taxafile <- system.file("extdata", "taxa_tab.rds", package="MicrobiotaProcess") taxa <- readRDS(taxafile) # the seqtab and taxa are output of dada2 sampleda <- system.file("extdata", "mouse.time.dada2.txt", package="MicrobiotaProcess") mpse1 <- mp_import_dada2(seqtab=seqtab, taxatab=taxa, sampleda=sampleda) mpse1 # parsing the output of qiime2 otuqzafile <- system.file("extdata", "table.qza", package="MicrobiotaProcess") taxaqzafile <- system.file("extdata", "taxa.qza", package="MicrobiotaProcess") mapfile <- system.file("extdata", "metadata_qza.txt", package="MicrobiotaProcess") mpse2 <- mp_import_qiime2(otuqza=otuqzafile, taxaqza=taxaqzafile, mapfilename=mapfile) mpse2 # parsing the output of MetaPhlAn file1 <- system.file("extdata/MetaPhlAn", "metaphlan_test.txt", package="MicrobiotaProcess") sample.file <- system.file("extdata/MetaPhlAn", "sample_test.txt", package="MicrobiotaProcess") mpse3 <- mp_import_metaphlan(profile=file1, mapfilename=sample.file) mpse3 # convert phyloseq object to mpse #library(phyloseq) #data(esophagus) #esophagus #mpse4 <- esophagus %>% as.MPSE() #mpse4 # convert TreeSummarizedExperiment object to mpse # library(curatedMetagenomicData) # tse <- curatedMetagenomicData::curatedMetagenomicData("ZhuF_2020.relative_abundance", dryrun=F) # tse[[1]] %>% as.MPSE() -> mpse5 # mpse5 ## ---- fig.align="center", fig.width=12, fig.height=4, fig.cap="The rarefaction of samples or groups"---- library(ggplot2) library(MicrobiotaProcess) data(mouse.time.mpse) mouse.time.mpse # Rarefied species richness mouse.time.mpse %<>% mp_rrarefy() # 'chunks' represent the split number of each sample to calculate alpha # diversity, default is 400. e.g. If a sample has total 40000 # reads, if chunks is 400, it will be split to 100 sub-samples # (100, 200, 300,..., 40000), then alpha diversity index was # calculated based on the sub-samples. # '.abundance' the column name of abundance, if the '.abundance' is not be # rarefied calculate rarecurve, user can specific 'force=TRUE'. mouse.time.mpse %<>% mp_cal_rarecurve( .abundance = RareAbundance, chunks = 400 ) # The RareAbundanceRarecurve column will be added the colData slot # automatically (default action="add") mouse.time.mpse %>% print(width=180) # default will display the confidence interval around smooth. # se=TRUE p1 <- mouse.time.mpse %>% mp_plot_rarecurve( .rare = RareAbundanceRarecurve, .alpha = Observe, ) p2 <- mouse.time.mpse %>% mp_plot_rarecurve( .rare = RareAbundanceRarecurve, .alpha = Observe, .group = time ) + scale_color_manual(values=c("#00A087FF", "#3C5488FF")) + scale_fill_manual(values=c("#00A087FF", "#3C5488FF"), guide="none") # combine the samples belong to the same groups if # plot.group=TRUE p3 <- mouse.time.mpse %>% mp_plot_rarecurve( .rare = RareAbundanceRarecurve, .alpha = "Observe", .group = time, plot.group = TRUE ) + scale_color_manual(values=c("#00A087FF", "#3C5488FF")) + scale_fill_manual(values=c("#00A087FF", "#3C5488FF"),guide="none") p1 + p2 + p3 # Users can extract the result with mp_extract_rarecurve to extract the result of mp_cal_rarecurve # and visualized the result manually. ## ---- fig.width=7, fig.height=7, fig.align="center", message=FALSE, fig.cap="The alpha diversity comparison"---- library(ggplot2) library(MicrobiotaProcess) mouse.time.mpse %<>% mp_cal_alpha(.abundance=RareAbundance) mouse.time.mpse f1 <- mouse.time.mpse %>% mp_plot_alpha( .group=time, .alpha=c(Observe, Chao1, ACE, Shannon, Simpson, Pielou) ) + scale_fill_manual(values=c("#00A087FF", "#3C5488FF"), guide="none") + scale_color_manual(values=c("#00A087FF", "#3C5488FF"), guide="none") f2 <- mouse.time.mpse %>% mp_plot_alpha( .alpha=c(Observe, Chao1, ACE, Shannon, Simpson, Pielou) ) f1 / f2 ## ---- fig.align="center", fig.width=8, fig.height=8, fig.cap="The relative abundance and abundance of phyla of all samples "---- mouse.time.mpse mouse.time.mpse %<>% mp_cal_abundance( # for each samples .abundance = RareAbundance ) %>% mp_cal_abundance( # for each groups .abundance=RareAbundance, .group=time ) mouse.time.mpse # visualize the relative abundance of top 20 phyla for each sample. p1 <- mouse.time.mpse %>% mp_plot_abundance( .abundance=RareAbundance, .group=time, taxa.class = Phylum, topn = 20, relative = TRUE ) # visualize the abundance (rarefied) of top 20 phyla for each sample. p2 <- mouse.time.mpse %>% mp_plot_abundance( .abundance=RareAbundance, .group=time, taxa.class = Phylum, topn = 20, relative = FALSE ) p1 / p2 ## ---- fig.align="center", fig.width=12, fig.height=4, fig.cap="The relative abundance and abundance of phyla of all samples "---- h1 <- mouse.time.mpse %>% mp_plot_abundance( .abundance = RareAbundance, .group = time, taxa.class = Phylum, relative = TRUE, topn = 20, geom = 'heatmap', features.dist = 'euclidean', features.hclust = 'average', sample.dist = 'bray', sample.hclust = 'average' ) h2 <- mouse.time.mpse %>% mp_plot_abundance( .abundance = RareAbundance, .group = time, taxa.class = Phylum, relative = FALSE, topn = 20, geom = 'heatmap', features.dist = 'euclidean', features.hclust = 'average', sample.dist = 'bray', sample.hclust = 'average' ) # the character (scale or theme) of figure can be adjusted by set_scale_theme # refer to the mp_plot_dist aplot::plot_list(gglist=list(h1, h2), tag_levels="A") ## ---- fig.align="center", fig.width=8, fig.height=8, fig.cap="The relative abundance and abundance of phyla of groups "---- # visualize the relative abundance of top 20 phyla for each .group (time) p3 <- mouse.time.mpse %>% mp_plot_abundance( .abundance=RareAbundance, .group=time, taxa.class = Phylum, topn = 20, plot.group = TRUE ) # visualize the abundance of top 20 phyla for each .group (time) p4 <- mouse.time.mpse %>% mp_plot_abundance( .abundance=RareAbundance, .group= time, taxa.class = Phylum, topn = 20, relative = FALSE, plot.group = TRUE ) p3 / p4 ## ---- fig.align="center", fig.width=5, fig.height=4.6, fig.cap='the distance between samples'---- # standardization # mp_decostand wraps the decostand of vegan, which provides # many standardization methods for community ecology. # default is hellinger, then the abundance processed will # be stored to the assays slot. mouse.time.mpse %<>% mp_decostand(.abundance=Abundance) mouse.time.mpse # calculate the distance between the samples. # the distance will be generated a nested tibble and added to the # colData slot. mouse.time.mpse %<>% mp_cal_dist(.abundance=hellinger, distmethod="bray") mouse.time.mpse # mp_plot_dist provides there methods to visualize the distance between the samples or groups # when .group is not provided, the dot heatmap plot will be return p1 <- mouse.time.mpse %>% mp_plot_dist(.distmethod = bray) p1 ## ---- fig.align="center", fig.width= 5, fig.height = 4.4, fig.cap = "The distance between samples with group information"---- # when .group is provided, the dot heatmap plot with group information will be return. p2 <- mouse.time.mpse %>% mp_plot_dist(.distmethod = bray, .group = time) # The scale or theme of dot heatmap plot can be adjusted using set_scale_theme function. p2 %>% set_scale_theme( x = scale_fill_manual( values=c("orange", "deepskyblue"), guide = guide_legend( keywidth = 1, keyheight = 0.5, title.theme = element_text(size=8), label.theme = element_text(size=6) ) ), aes_var = time # specific the name of variable ) %>% set_scale_theme( x = scale_color_gradient( guide = guide_legend(keywidth = 0.5, keyheight = 0.5) ), aes_var = bray ) %>% set_scale_theme( x = scale_size_continuous( range = c(0.1, 3), guide = guide_legend(keywidth = 0.5, keyheight = 0.5) ), aes_var = bray ) ## ---- fig.align="center", fig.width=2, fig.height=4, fig.cap = "The comparison of distance among the groups"---- # when .group is provided and group.test is TRUE, the comparison of different groups will be returned p3 <- mouse.time.mpse %>% mp_plot_dist(.distmethod = bray, .group = time, group.test=TRUE, textsize=2) p3 ## ---- fig.width=10, fig.height=4, fig.align="center", fig.cap="The PCoA result"---- mouse.time.mpse %<>% mp_cal_pcoa(.abundance=hellinger, distmethod="bray") # The dimensions of ordination analysis will be added the colData slot (default). mouse.time.mpse # We also can perform adonis or anosim to check whether it is significant to the dissimilarities of groups. mouse.time.mpse %<>% mp_adonis(.abundance=hellinger, .formula=~time, distmethod="bray", permutations=9999, action="add") mouse.time.mpse %>% mp_extract_internal_attr(name=adonis) library(ggplot2) p1 <- mouse.time.mpse %>% mp_plot_ord( .ord = pcoa, .group = time, .color = time, .size = 1.2, .alpha = 1, ellipse = TRUE, show.legend = FALSE # don't display the legend of stat_ellipse ) + scale_fill_manual(values=c("#00A087FF", "#3C5488FF")) + scale_color_manual(values=c("#00A087FF", "#3C5488FF")) # The size of point also can be mapped to other variables such as Observe, or Shannon # Then the alpha diversity and beta diversity will be displayed simultaneously. p2 <- mouse.time.mpse %>% mp_plot_ord( .ord = pcoa, .group = time, .color = time, .size = Observe, .alpha = Shannon, ellipse = TRUE, show.legend = FALSE # don't display the legend of stat_ellipse ) + scale_fill_manual( values = c("#00A087FF", "#3C5488FF"), guide = guide_legend(keywidth=0.6, keyheight=0.6, label.theme=element_text(size=6.5)) ) + scale_color_manual( values=c("#00A087FF", "#3C5488FF"), guide = guide_legend(keywidth=0.6, keyheight=0.6, label.theme=element_text(size=6.5)) ) + scale_size_continuous( range=c(0.5, 3), guide = guide_legend(keywidth=0.6, keyheight=0.6, label.theme=element_text(size=6.5)) ) p1 + p2 ## ---- fig.width=5, fig.height=5, fig.align="center", fig.cap="The hierarchical cluster result of samples"---- mouse.time.mpse %<>% mp_cal_clust( .abundance = hellinger, distmethod = "bray", hclustmethod = "average", # (UPGAE) action = "add" # action is used to control which result will be returned ) mouse.time.mpse # if action = 'add', the result of hierarchical cluster will be added to the MPSE object # mp_extract_internal_attr can extract it. It is a treedata object, so it can be visualized # by ggtree. sample.clust <- mouse.time.mpse %>% mp_extract_internal_attr(name='SampleClust') sample.clust library(ggtree) p <- ggtree(sample.clust) + geom_tippoint(aes(color=time)) + geom_tiplab(as_ylab = TRUE) + ggplot2::scale_x_continuous(expand=c(0, 0.01)) p ## ---- fig.width=10, fig.height=6, fig.align="center", fig.cap = "The hierarchical cluster result of samples and the abundance of Phylum"---- library(ggtreeExtra) library(ggplot2) phyla.tb <- mouse.time.mpse %>% mp_extract_abundance(taxa.class=Phylum, topn=30) # The abundance of each samples is nested, it can be flatted using the unnest of tidyr. phyla.tb %<>% tidyr::unnest(cols=RareAbundanceBySample) %>% dplyr::rename(Phyla="label") phyla.tb p1 <- p + geom_fruit( data=phyla.tb, geom=geom_col, mapping = aes(x = RelRareAbundanceBySample, y = Sample, fill = Phyla ), orientation = "y", #offset = 0.4, pwidth = 3, axis.params = list(axis = "x", title = "The relative abundance of phyla (%)", title.size = 4, text.size = 2, vjust = 1), grid.params = list() ) p1 ## ---- fig.width=10, fig.height=10, fig.align="center", fig.cap="The different species and the abundance of sample"---- library(ggtree) library(ggtreeExtra) library(ggplot2) library(MicrobiotaProcess) library(tidytree) library(ggstar) library(forcats) mouse.time.mpse %>% print(width=150) mouse.time.mpse %<>% mp_diff_analysis( .abundance = RelRareAbundanceBySample, .group = time, first.test.alpha = 0.01 ) # The result is stored to the taxatree or otutree slot, you can use mp_extract_tree to extract the specific slot. taxa.tree <- mouse.time.mpse %>% mp_extract_tree(type="taxatree") taxa.tree # And the result tibble of different analysis can also be extracted # with tidytree (>=0.3.5) taxa.tree %>% select(label, nodeClass, LDAupper, LDAmean, LDAlower, Sign_time, pvalue, fdr) %>% dplyr::filter(!is.na(fdr)) # Since taxa.tree is treedata object, it can be visualized by ggtree and ggtreeExtra p1 <- ggtree( taxa.tree, layout="radial", size = 0.3 ) + geom_point( data = td_filter(!isTip), fill="white", size=1, shape=21 ) # display the high light of phylum clade. p2 <- p1 + geom_hilight( data = td_filter(nodeClass == "Phylum"), mapping = aes(node = node, fill = label) ) # display the relative abundance of features(OTU) p3 <- p2 + ggnewscale::new_scale("fill") + geom_fruit( data = td_unnest(RareAbundanceBySample), geom = geom_star, mapping = aes( x = fct_reorder(Sample, time, .fun=min), size = RelRareAbundanceBySample, fill = time, subset = RelRareAbundanceBySample > 0 ), starshape = 13, starstroke = 0.25, offset = 0.04, pwidth = 0.8, grid.params = list(linetype=2) ) + scale_size_continuous( name="Relative Abundance (%)", range = c(.5, 3) ) + scale_fill_manual(values=c("#1B9E77", "#D95F02")) # display the tip labels of taxa tree p4 <- p3 + geom_tiplab(size=2, offset=7.2) # display the LDA of significant OTU. p5 <- p4 + ggnewscale::new_scale("fill") + geom_fruit( geom = geom_col, mapping = aes( x = LDAmean, fill = Sign_time, subset = !is.na(LDAmean) ), orientation = "y", offset = 0.3, pwidth = 0.5, axis.params = list(axis = "x", title = "Log10(LDA)", title.height = 0.01, title.size = 2, text.size = 1.8, vjust = 1), grid.params = list(linetype = 2) ) # display the significant (FDR) taxonomy after kruskal.test (default) p6 <- p5 + ggnewscale::new_scale("size") + geom_point( data=td_filter(!is.na(Sign_time)), mapping = aes(size = -log10(fdr), fill = Sign_time, ), shape = 21, ) + scale_size_continuous(range=c(1, 3)) + scale_fill_manual(values=c("#1B9E77", "#D95F02")) p6 + theme( legend.key.height = unit(0.3, "cm"), legend.key.width = unit(0.3, "cm"), legend.spacing.y = unit(0.02, "cm"), legend.text = element_text(size = 7), legend.title = element_text(size = 9), ) ## ---- fig.width=9, fig.height=9, fig.align="center", fig.cap="The different species and the abundance of group"---- # this object has contained the result of mp_diff_analysis mouse.time.mpse p <- mouse.time.mpse %>% mp_plot_diff_res( group.abun = TRUE, pwidth.abun=0.1 ) + scale_fill_manual(values=c("deepskyblue", "orange")) + scale_fill_manual( aesthetics = "fill_new", # The fill aes was renamed to "fill_new" for the abundance dotplot layer values = c("deepskyblue", "orange") ) + scale_fill_manual( aesthetics = "fill_new_new", # The fill aes for hight light layer of tree was renamed to 'fill_new_new' values = c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00", "#FFFF33", "#A65628", "#F781BF", "#999999" ) ) p ## ---- fig.width = 9, fig.height=9, fig.align="center", fig.cap="The cladogram of differential species "---- f <- mouse.time.mpse %>% mp_plot_diff_cladogram( label.size = 2.5, hilight.alpha = .3, bg.tree.size = .5, bg.point.size = 2, bg.point.stroke = .25 ) + scale_fill_diff_cladogram( # set the color of different group. values = c('deepskyblue', 'orange') ) + scale_size_continuous(range = c(1, 4)) f ## ---- fig.width = 12, fig.height = 5, fig.align = 'center', fig.cap = 'The abundance and LDA effect size of differential taxa'---- f.box <- mouse.time.mpse %>% mp_plot_diff_boxplot( .group = time, ) %>% set_diff_boxplot_color( values = c("deepskyblue", "orange"), guide = guide_legend(title=NULL) ) f.bar <- mouse.time.mpse %>% mp_plot_diff_boxplot( taxa.class = c(Genus, OTU), # select the taxonomy level to display group.abun = TRUE, # display the mean abundance of each group removeUnknown = TRUE, # whether mask the unknown taxa. ) %>% set_diff_boxplot_color( values = c("deepskyblue", "orange"), guide = guide_legend(title=NULL) ) aplot::plot_list(f.box, f.bar) ## ---- fig.width = 10, fig.height = 4.5, fig.align = 'center', fig.cap = 'The mahattan plot of differential taxa'---- f.mahattan <- mouse.time.mpse %>% mp_plot_diff_manhattan( .group = Sign_time, .y = fdr, .size = 2.4, taxa.class = c('OTU', 'Genus'), anno.taxa.class = Phylum ) f.mahattan ## ---- echo=FALSE-------------------------------------------------------------- sessionInfo()