## ----dontrun-basics-vignette, eval=FALSE--------------------------------------
#  vignette("phyloseq-basics")

## ----load-packages, message=FALSE, warning=FALSE------------------------------
library("phyloseq")
library("ggplot2")

## ----ggplot2-themes-----------------------------------------------------------
theme_set(theme_bw())

## -----------------------------------------------------------------------------
data(GlobalPatterns)

## -----------------------------------------------------------------------------
# prune OTUs that are not present in at least one sample
GP <- prune_taxa(taxa_sums(GlobalPatterns) > 0, GlobalPatterns)
# Define a human-associated versus non-human categorical variable:
human <- get_variable(GP, "SampleType") %in% c("Feces", "Mock", "Skin", "Tongue")
# Add new human variable to sample data:
sample_data(GP)$human <- factor(human)

## ----richness_estimates0, fig.width=13, fig.height=7--------------------------
alpha_meas = c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson")
(p <- plot_richness(GP, "human", "SampleType", measures=alpha_meas))

## ----richness_estimates, fig.width=13,height=7--------------------------------
p + geom_boxplot(data=p$data, aes(x=human, y=value, color=NULL), alpha=0.1)

## -----------------------------------------------------------------------------
GP.chl <- subset_taxa(GP, Phylum=="Chlamydiae")

## ----GP-chl-tree, fig.width=15, fig.height=7, message=FALSE, warning=FALSE----
plot_tree(GP.chl, color="SampleType", shape="Family", label.tips="Genus", size="Abundance")

## -----------------------------------------------------------------------------
data(enterotype)

## ----EntAbundPlot, fig.height=6, fig.width=8----------------------------------
par(mar = c(10, 4, 4, 2) + 0.1) # make more room on bottom margin
N <- 30
barplot(sort(taxa_sums(enterotype), TRUE)[1:N]/nsamples(enterotype), las=2)

## -----------------------------------------------------------------------------
rank_names(enterotype)

## -----------------------------------------------------------------------------
TopNOTUs <- names(sort(taxa_sums(enterotype), TRUE)[1:10]) 
ent10   <- prune_taxa(TopNOTUs, enterotype)
print(ent10)

## -----------------------------------------------------------------------------
sample_variables(ent10)

## ----entbarplot0, fig.height=6, fig.width=10----------------------------------
plot_bar(ent10, "SeqTech", fill="Enterotype", facet_grid=~Genus)

## ----GPheatmap----------------------------------------------------------------
data("GlobalPatterns")
gpac <- subset_taxa(GlobalPatterns, Phylum=="Crenarchaeota")
(p <- plot_heatmap(gpac, "NMDS", "bray", "SampleType", "Family"))

## ----GPheatmap-rename-axes----------------------------------------------------
p$scales$scales[[1]]$name <- "My X-Axis"
p$scales$scales[[2]]$name <- "My Y-Axis"
print(p)

## ----plot_sample_network, fig.width=11, fig.height=7, message=FALSE, warning=FALSE----
data(enterotype)
plot_net(enterotype, maxdist=0.4, color="SeqTech", shape="Enterotype")

## ----eval=FALSE---------------------------------------------------------------
#  my.physeq <- import("Biom", BIOMfilename="myBiomFile.biom")
#  my.ord    <- ordinate(my.physeq)
#  plot_ordination(my.physeq, my.ord, color="myFavoriteVarible")

## ----help-import, eval=FALSE--------------------------------------------------
#  help(import)
#  help(ordinate)
#  help(distance)
#  help(plot_ordination)

## ----GP-data-load-------------------------------------------------------------
data(GlobalPatterns)

## ---- eval=FALSE--------------------------------------------------------------
#  GPUF <- UniFrac(GlobalPatterns)

## ----load-precomputed-UF------------------------------------------------------
load(system.file("doc", "Unweighted_UniFrac.RData", package="phyloseq"))

## -----------------------------------------------------------------------------
GloPa.pcoa = ordinate(GlobalPatterns, method="PCoA", distance=GPUF)

## ----PCoAScree, fig.width=6, fig.height=4-------------------------------------
plot_scree(GloPa.pcoa, "Scree plot for Global Patterns, UniFrac/PCoA")

## ----GPfig5ax1213-------------------------------------------------------------
(p12 <- plot_ordination(GlobalPatterns, GloPa.pcoa, "samples", color="SampleType") + 
  geom_point(size=5) + geom_path() + scale_colour_hue(guide = FALSE) )
(p13 <- plot_ordination(GlobalPatterns, GloPa.pcoa, "samples", axes=c(1, 3),
  color="SampleType") + geom_line() + geom_point(size=5) )

## ----GP_UF_NMDS0--------------------------------------------------------------
# (Re)load UniFrac distance matrix and GlobalPatterns data
data(GlobalPatterns)
load(system.file("doc", "Unweighted_UniFrac.RData", package="phyloseq"))
# perform NMDS, set to 2 axes
GP.NMDS <- ordinate(GlobalPatterns, "NMDS", GPUF)
(p <- plot_ordination(GlobalPatterns, GP.NMDS, "samples", color="SampleType") +
  geom_line() + geom_point(size=5) )

## ----GPCAscree0, fig=FALSE----------------------------------------------------
data(GlobalPatterns)
# Take a subset of the GP dataset, top 200 species
topsp <- names(sort(taxa_sums(GlobalPatterns), TRUE)[1:200])
GP    <- prune_taxa(topsp, GlobalPatterns)
# Subset further to top 5 phyla, among the top 200 OTUs.
top5ph <- sort(tapply(taxa_sums(GP), tax_table(GP)[, "Phylum"], sum), decreasing=TRUE)[1:5]
GP     <- subset_taxa(GP, Phylum %in% names(top5ph))
# Re-add human variable to sample data:
sample_data(GP)$human <- factor(human)

## ----GPCAscree, fig.width=8, fig.height=5-------------------------------------
# Now perform a unconstrained correspondence analysis
gpca  <- ordinate(GP, "CCA")
# Scree plot
plot_scree(gpca, "Scree Plot for Global Patterns Correspondence Analysis")

## ----GPCA1234-----------------------------------------------------------------
(p12 <- plot_ordination(GP, gpca, "samples", color="SampleType") + 
  geom_line() + geom_point(size=5) )
(p34 <- plot_ordination(GP, gpca, "samples", axes=c(3, 4), color="SampleType") + 
  geom_line() + geom_point(size=5) )

## ----GPCAspecplot0------------------------------------------------------------
p1  <- plot_ordination(GP, gpca, "species", color="Phylum")
(p1 <- ggplot(p1$data, p1$mapping) + geom_point(size=5, alpha=0.5) + 
  facet_wrap(~Phylum) +  scale_colour_hue(guide = FALSE) )

## ----GPCAspecplotTopo0--------------------------------------------------------
(p3 <- ggplot(p1$data, p1$mapping) + geom_density2d() +
  facet_wrap(~Phylum) +  scale_colour_hue(guide = FALSE) )

## ----GPCAjitter0--------------------------------------------------------------
library("reshape2")
# Melt the species-data.frame, DF, to facet each CA axis separately
mdf <- melt(p1$data[, c("CA1", "CA2", "Phylum", "Family", "Genus")], 
            id=c("Phylum", "Family", "Genus") )
# Select some special outliers for labelling
LF <- subset(mdf, variable=="CA2" & value < -1.0)
# build plot: boxplot summaries of each CA-axis, with labels
p <- ggplot(mdf, aes(Phylum, value, color=Phylum)) + 
  geom_boxplot() + 
  facet_wrap(~variable, 2) + 
  scale_colour_hue(guide = FALSE) +
  theme_bw() + 
  theme( axis.text.x = element_text(angle = -90, vjust = 0.5) )
# Add the text label layer, and render ggplot graphic
(p <- p + geom_text(data=subset(LF, !is.na(Family)),
  mapping = aes(Phylum, value+0.1, color=Phylum, label=Family), 
  vjust=0,
  size=2))

## ----GPtaxaplot0--------------------------------------------------------------
plot_bar(GP, x="human", fill="SampleType", facet_grid= ~ Phylum)

## ----GPdpcoa01----------------------------------------------------------------
# Perform ordination
GP.dpcoa <- ordinate(GP, "DPCoA") 
# Generate default ordination bi-plot
pdpcoa <- 
  plot_ordination(
    physeq = GP, 
    ordination = GP.dpcoa, 
    type="biplot",
    color="SampleType", 
    shape="Phylum")
# Adjust the shape scale manually 
# to make taxa hollow and samples filled (advanced)
shape.fac <- pdpcoa$data$Phylum
man.shapes <- c(19, 21:25)
names(man.shapes) <- c("Samples", levels(shape.fac)[levels(shape.fac)!="Samples"])
p2dpcoa <- pdpcoa + scale_shape_manual(values=man.shapes)
p2dpcoa

## ----GPdpcoa02----------------------------------------------------------------
# Show just Samples or just Taxa
plot_ordination(GP, GP.dpcoa, type="taxa", shape="Phylum")
plot_ordination(GP, GP.dpcoa, type="samples", color="SampleType")
# Split
plot_ordination(GP, GP.dpcoa, type="split",
                color="SampleType", shape="Phylum") +
  ggplot2::scale_colour_discrete()

## ----distancefun--------------------------------------------------------------
data(esophagus)
distance(esophagus, "bray") 
distance(esophagus, "wunifrac") # weighted UniFrac
distance(esophagus, "jaccard") # vegdist jaccard
distance(esophagus, "g") # betadiver method option "g"

## ----eval=FALSE, echo=TRUE----------------------------------------------------
#  data(esophagus)
#  distance(esophagus, "wUniFrac")
#  distance(esophagus, "uUniFrac")

## -----------------------------------------------------------------------------
# (Re)load UniFrac distance matrix and GlobalPatterns data
data(GlobalPatterns)
load(system.file("doc", "Unweighted_UniFrac.RData", package="phyloseq"))
# Manually define color-shading vector based on sample type.
colorScale    <- rainbow(length(levels(get_variable(GlobalPatterns, "SampleType"))))
cols          <- colorScale[get_variable(GlobalPatterns, "SampleType")] 
GP.tip.labels <- as(get_variable(GlobalPatterns, "SampleType"), "character")
# This is the actual hierarchical clustering call, specifying average-link clustering
GP.hclust     <- hclust(GPUF, method="average")
plot(GP.hclust, col=cols)