## ----getPackage, eval=FALSE---------------------------------------------------
#  
#      if (!requireNamespace('BiocManager', quietly = TRUE))
#          install.packages('BiocManager')
#  
#      BiocManager::install('PCAtools')
#  

## ----getPackageDevel, eval=FALSE----------------------------------------------
#  
#      devtools::install_github('kevinblighe/PCAtools')
#  

## ----Load, message=FALSE------------------------------------------------------

    library(PCAtools)


## ---- message = FALSE---------------------------------------------------------

  library(Biobase)
  library(GEOquery)

  # load series and platform data from GEO
  gset <- getGEO('GSE2990', GSEMatrix = TRUE, getGPL = FALSE)

  x <- exprs(gset[[1]])

  # remove Affymetrix control probes
  x <- x[-grep('^AFFX', rownames(x)),]

  # extract information of interest from the phenotype data (pdata)
  idx <- which(colnames(pData(gset[[1]])) %in%
    c('age:ch1', 'distant rfs:ch1', 'er:ch1',
      'ggi:ch1', 'grade:ch1', 'size:ch1',
      'time rfs:ch1'))

  metadata <- data.frame(pData(gset[[1]])[,idx],
    row.names = rownames(pData(gset[[1]])))

  # tidy column names
  colnames(metadata) <- c('Age', 'Distant.RFS', 'ER', 'GGI', 'Grade',
    'Size', 'Time.RFS')

  # prepare certain phenotypes
  metadata$Age <- as.numeric(gsub('^KJ', NA, metadata$Age))
  metadata$Distant.RFS <- factor(metadata$Distant.RFS, levels=c(0,1))
  metadata$ER <- factor(gsub('\\?', NA, metadata$ER), levels=c(0,1))
  metadata$ER <- factor(ifelse(metadata$ER == 1, 'ER+', 'ER-'), levels = c('ER-', 'ER+'))
  metadata$GGI <- as.numeric(metadata$GGI)
  metadata$Grade <- factor(gsub('\\?', NA, metadata$Grade), levels=c(1,2,3))
  metadata$Grade <- gsub(1, 'Grade 1', gsub(2, 'Grade 2', gsub(3, 'Grade 3', metadata$Grade)))
  metadata$Grade <- factor(metadata$Grade, levels = c('Grade 1', 'Grade 2', 'Grade 3'))
  metadata$Size <- as.numeric(metadata$Size)
  metadata$Time.RFS <- as.numeric(gsub('^KJX|^KJ', NA, metadata$Time.RFS))

  # remove samples from the pdata that have any NA value
  discard <- apply(metadata, 1, function(x) any(is.na(x)))

  metadata <- metadata[!discard,]

  # filter the expression data to match the samples in our pdata
  x <- x[,which(colnames(x) %in% rownames(metadata))]

  # check that sample names match exactly between pdata and expression data 
  all(colnames(x) == rownames(metadata))


## -----------------------------------------------------------------------------

  p <- pca(x, metadata = metadata, removeVar = 0.1)


## ----ex1, warning = FALSE, fig.height = 7, fig.width = 18, fig.cap = 'Figure 1: A scree plot to show the proportion of explained variance by PC'----

  screeplot(p)


## ----ex2, fig.height = 7, fig.width = 8, fig.cap = 'Figure 2: A bi-plot of PC1 versus PC2'----

  biplot(p)


## ----ex3, message = FALSE, fig.height = 10, fig.width = 10, fig.cap = 'Figure 3: A pairs plot, comparing PC1 - PC5 on a pairwise basis'----

  pairsplot(p)


## ----ex4, fig.height = 6, fig.width = 8, fig.cap = 'Figure 4: Plot the component loadings and label genes most responsible for variation'----

  plotloadings(p)


## ----ex5, warning = FALSE, fig.height = 4, fig.width = 8, fig.cap = 'Figure 5: Correlate PCs to metadata variables'----

  eigencorplot(p,
    metavars = c('Age','Distant.RFS','ER','GGI','Grade','Size','Time.RFS'))


## ---- warning = FALSE---------------------------------------------------------

  horn <- parallelPCA(x)
  horn$n


## -----------------------------------------------------------------------------

  elbow <- findElbowPoint(p$variance)
  elbow


## ----ex6, fig.height = 7, fig.width = 9, fig.cap = 'Figure 6: Advanced scree plot illustrating optimum number of PCs'----

  library(ggplot2)

  screeplot(p,
    components = getComponents(p, 1:20),
    vline = c(horn$n, elbow)) +
    geom_text(aes(horn$n + 1, 50, label = "Horn's", vjust = -1)) +
    geom_text(aes(elbow + 1, 50, label = "Elbow", vjust = -1))


## ----ex7, fig.height = 6, fig.width = 8, fig.cap = 'Figure 7: adding lines and a legend to a bi-plot'----

  biplot(p,
    lab = paste0(p$metadata$Age, 'yo'),
    colby = 'ER',
    hline = 0, vline = 0,
    legendPosition = 'right')


## ----ex8, message = FALSE, fig.height = 7, fig.width = 7, fig.cap = 'Figure 8: supplying custom colours to a bi-plot'----

  biplot(p,
    colby = 'ER', colkey = c('ER+'='forestgreen', 'ER-'='purple'),
    hline = 0, vline = c(-25, 0, 25),
    legendPosition = 'top', legendLabSize = 16, legendIconSize = 8.0)


## ----ex9, message = FALSE, fig.height = 7, fig.width = 7, fig.cap = 'Figure 9: supplying custom colours and shapes to a bi-plot, removing connectors, and modifying titles'----

  biplot(p,
    colby = 'ER', colkey = c('ER+'='forestgreen', 'ER-'='purple'),
    hline = 0, vline = c(-25, 0, 25),
    legendPosition = 'top', legendLabSize = 16, legendIconSize = 8.0,
    shape = 'Grade', shapekey = c('Grade 1'=15, 'Grade 2'=17, 'Grade 3'=8),
    drawConnectors = FALSE,
    title = 'PCA bi-plot',
    subtitle = 'PC1 versus PC2',
    caption = '27 PCs == 80%')


## ----ex10, message = FALSE, fig.height = 6, fig.width = 8, fig.cap = 'Figure 10: removing labels, modifying line types, removing gridlines, and increasing point size in a bi-plot'----

  biplot(p,
    lab = NULL,
    colby = 'ER', colkey = c('ER+'='royalblue', 'ER-'='red3'),
    hline = 0, vline = c(-25, 0, 25),
    vlineType = c('dotdash', 'solid', 'dashed'),
    gridlines.major = FALSE, gridlines.minor = FALSE,
    pointSize = 5,
    legendPosition = 'left', legendLabSize = 16, legendIconSize = 8.0,
    shape = 'Grade', shapekey = c('Grade 1'=15, 'Grade 2'=17, 'Grade 3'=8),
    drawConnectors = FALSE,
    title = 'PCA bi-plot',
    subtitle = 'PC1 versus PC2',
    caption = '27 PCs == 80%')


## ----ex11, fig.height = 6, fig.width = 8, fig.cap = 'Figure 11: colouring by a continuous variable and plotting other PCs in a bi-plot'----

  biplot(p, x = 'PC10', y = 'PC50',
    lab = NULL,
    colby = 'Age',
    hline = 0, vline = 0,
    hlineWidth = 1.0, vlineWidth = 1.0,
    gridlines.major = FALSE, gridlines.minor = TRUE,
    pointSize = 5,
    legendPosition = 'left', legendLabSize = 16, legendIconSize = 8.0,
    shape = 'Grade', shapekey = c('Grade 1'=15, 'Grade 2'=17, 'Grade 3'=8),
    drawConnectors = FALSE,
    title = 'PCA bi-plot',
    subtitle = 'PC10 versus PC50',
    caption = '27 PCs == 80%')


## ----ex12, message = FALSE, fig.height = 8, fig.width = 7, fig.cap = 'Figure 12: maximising available space in a pairs plot'----

  pairsplot(p,
    components = getComponents(p, c(1:10)),
    triangle = TRUE, trianglelabSize = 12,
    hline = 0, vline = 0,
    pointSize = 0.4,
    gridlines.major = FALSE, gridlines.minor = FALSE,
    colby = 'Grade',
    title = 'Pairs plot', plotaxes = FALSE,
    margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm'))


## ----ex13, fig.height = 6, fig.width = 8, fig.cap = 'Figure 13: arranging a pairs plot horizontally'----

  pairsplot(p,
    components = getComponents(p, c(4,33,11,1)),
    triangle = FALSE,
    hline = 0, vline = 0,
    pointSize = 0.8,
    gridlines.major = FALSE, gridlines.minor = FALSE,
    colby = 'ER',
    title = 'Pairs plot', plotaxes = TRUE,
    margingaps = unit(c(0.1, 0.1, 0.1, 0.1), 'cm'))


## ----ex14, fig.height = 6, fig.width = 9, fig.cap = 'Figure 14: modifying cut-off for labeling in a loadings plot'----

  plotloadings(p,
    rangeRetain = 0.01,
    labSize = 3.0,
    title = 'Loadings plot',
    subtitle = 'PC1, PC2, PC3, PC4, PC5',
    caption = 'Top 1% variables',
    shape = 24,
    col = c('limegreen', 'black', 'red3'),
    drawConnectors = TRUE)


## ----eval = FALSE-------------------------------------------------------------
#  
#    library(biomaRt)
#  
#    mart <- useMart('ENSEMBL_MART_ENSEMBL', host = 'useast.ensembl.org')
#    mart <- useDataset('hsapiens_gene_ensembl', mart)
#  
#    getBM(mart = mart,
#      attributes = c('affy_hg_u133a', 'ensembl_gene_id',
#        'gene_biotype', 'external_gene_name'),
#      filter = 'affy_hg_u133a',
#      values = c('215281_x_at', '214464_at', '211122_s_at', '205225_at',
#        '202037_s_at', '204540_at', '215176_x_at', '205044_at', '208650_s_at',
#        '205380_at'),
#      uniqueRows = TRUE)
#  

## ----ex15, fig.height = 9, fig.width = 11, fig.cap = 'Figure 15: plotting absolute component loadings'----

  plotloadings(p,
    components = getComponents(p, c(4,33,11,1)),
    rangeRetain = 0.1,
    labSize = 3.0,
    absolute = FALSE,
    title = 'Loadings plot',
    subtitle = 'Misc PCs',
    caption = 'Top 10% variables',
    shape = 23, shapeSizeRange = c(1, 16),
    col = c('white', 'pink'),
    drawConnectors = FALSE)


## ----ex16, warning = FALSE, fig.height = 4, fig.width = 12, fig.cap = 'Figure 16: correlating PCs that account for at least 80% variation to clinical variables'----

  eigencorplot(p,
    components = getComponents(p, 1:27),
    metavars = c('Age','Distant.RFS','ER','GGI','Grade','Size','Time.RFS'),
    col = c('darkblue', 'blue2', 'black', 'red2', 'darkred'),
    cexCorval = 0.7,
    colCorval = 'white',
    fontCorval = 2,
    posLab = 'bottomleft',
    rotLabX = 45,
    posColKey = 'top',
    cexLabColKey = 1.5,
    scale = TRUE,
    main = 'PC1-27 clinical correlations',
    colFrame = 'white',
    plotRsquared = FALSE)


## ----ex17, warning = FALSE, fig.height = 5, fig.width = 12, fig.cap = 'Figure 17: modifying cut-offs and symbols for statistical significance in eigencorplot'----

  eigencorplot(p,
    components = getComponents(p, 1:horn$n),
    metavars = c('Age','Distant.RFS','ER','GGI','Grade','Size','Time.RFS'),
    col = c('white', 'cornsilk1', 'gold', 'forestgreen', 'darkgreen'),
    cexCorval = 1.2,
    fontCorval = 2,
    posLab = 'all',
    rotLabX = 45,
    scale = TRUE,
    main = bquote(Principal ~ component ~ Pearson ~ r^2 ~ clinical ~ correlates),
    plotRsquared = TRUE,
    corFUN = 'pearson',
    corUSE = 'pairwise.complete.obs',
    corMultipleTestCorrection = 'BH',
    signifSymbols = c('****', '***', '**', '*', ''),
    signifCutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1))


## ----ex18, message = FALSE, warning = FALSE, fig.height = 10, fig.width = 15, fig.cap = 'Figure 18: a merged panel of all PCAtools plots'----

  pscree <- screeplot(p, components = getComponents(p, 1:30),
    hline = 80, vline = 27, axisLabSize = 10, returnPlot = FALSE) +
    geom_text(aes(20, 80, label = '80% explained variation', vjust = -1))

  ppairs <- pairsplot(p, components = getComponents(p, c(1:3)),
    triangle = TRUE, trianglelabSize = 12,
    hline = 0, vline = 0,
    pointSize = 0.8, gridlines.major = FALSE, gridlines.minor = FALSE,
    colby = 'Grade',
    title = '', titleLabSize = 16, plotaxes = FALSE,
    margingaps = unit(c(0.01, 0.01, 0.01, 0.01), 'cm'),
    returnPlot = FALSE)

  pbiplot <- biplot(p, lab = NULL,
    colby = 'ER', colkey = c('ER+'='royalblue', 'ER-'='red3'),
    hline = 0, vline = c(-25, 0, 25), vlineType = c('dotdash', 'solid', 'dashed'),
    gridlines.major = FALSE, gridlines.minor = FALSE,
    pointSize = 2, axisLabSize = 12,
    legendPosition = 'left', legendLabSize = 10, legendIconSize = 3.0,
    shape = 'Grade', shapekey = c('Grade 1'=15, 'Grade 2'=17, 'Grade 3'=8),
    drawConnectors = FALSE,
    title = 'PCA bi-plot', subtitle = 'PC1 versus PC2',
      caption = '27 PCs == 80%',
    returnPlot = FALSE)

  ploadings <- plotloadings(p, rangeRetain = 0.01, labSize = 2.5,
    title = 'Loadings plot', axisLabSize = 12,
    subtitle = 'PC1, PC2, PC3, PC4, PC5',
    caption = 'Top 1% variables',
    shape = 24, shapeSizeRange = c(4, 4),
    col = c('limegreen', 'black', 'red3'),
    legendPosition = 'none',
    drawConnectors = FALSE,
    returnPlot = FALSE)

  peigencor <- eigencorplot(p,
    components = getComponents(p, 1:10),
    metavars = c('Age','Distant.RFS','ER','GGI','Grade','Size','Time.RFS'),
    #col = c('royalblue', '', 'gold', 'forestgreen', 'darkgreen'),
    cexCorval = 0.6,
    fontCorval = 2,
    posLab = 'all', 
    rotLabX = 45,
    scale = TRUE,
    main = "PC clinical correlates",
    cexMain = 1.5,
    plotRsquared = FALSE,
    corFUN = 'pearson',
    corUSE = 'pairwise.complete.obs',
    signifSymbols = c('****', '***', '**', '*', ''),
    signifCutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1),
    returnPlot = FALSE)

    library(cowplot)
    library(ggplotify)

    top_row <- plot_grid(pscree, ppairs, pbiplot,
      ncol = 3,
      labels = c('A', 'B  Pairs plot', 'C'),
      label_fontfamily = 'serif',
      label_fontface = 'bold',
      label_size = 22,
      align = 'h',
      rel_widths = c(1.05, 0.9, 1.05))

    bottom_row <- plot_grid(ploadings,
      as.grob(peigencor),
      ncol = 2,
      labels = c('D', 'E'),
      label_fontfamily = 'serif',
      label_fontface = 'bold',
      label_size = 22,
      align = 'h',
      rel_widths = c(1.5, 1.5))

    plot_grid(top_row, bottom_row, ncol = 1, rel_heights = c(1.0, 1.0))


## -----------------------------------------------------------------------------

sessionInfo()