## ----getPackage, eval=FALSE--------------------------------------------------- # # if (!requireNamespace('BiocManager', quietly = TRUE)) # install.packages('BiocManager') # # BiocManager::install('PCAtools') # ## ----getPackageDevel, eval=FALSE---------------------------------------------- # # devtools::install_github('kevinblighe/PCAtools') # ## ----Load, message=FALSE------------------------------------------------------ library(PCAtools) ## ---- message = FALSE--------------------------------------------------------- library(Biobase) library(GEOquery) # load series and platform data from GEO gset <- getGEO('GSE2990', GSEMatrix = TRUE, getGPL = FALSE) x <- exprs(gset[[1]]) # remove Affymetrix control probes x <- x[-grep('^AFFX', rownames(x)),] # extract information of interest from the phenotype data (pdata) idx <- which(colnames(pData(gset[[1]])) %in% c('age:ch1', 'distant rfs:ch1', 'er:ch1', 'ggi:ch1', 'grade:ch1', 'size:ch1', 'time rfs:ch1')) metadata <- data.frame(pData(gset[[1]])[,idx], row.names = rownames(pData(gset[[1]]))) # tidy column names colnames(metadata) <- c('Age', 'Distant.RFS', 'ER', 'GGI', 'Grade', 'Size', 'Time.RFS') # prepare certain phenotypes metadata$Age <- as.numeric(gsub('^KJ', NA, metadata$Age)) metadata$Distant.RFS <- factor(metadata$Distant.RFS, levels=c(0,1)) metadata$ER <- factor(gsub('\\?', NA, metadata$ER), levels=c(0,1)) metadata$ER <- factor(ifelse(metadata$ER == 1, 'ER+', 'ER-'), levels = c('ER-', 'ER+')) metadata$GGI <- as.numeric(metadata$GGI) metadata$Grade <- factor(gsub('\\?', NA, metadata$Grade), levels=c(1,2,3)) metadata$Grade <- gsub(1, 'Grade 1', gsub(2, 'Grade 2', gsub(3, 'Grade 3', metadata$Grade))) metadata$Grade <- factor(metadata$Grade, levels = c('Grade 1', 'Grade 2', 'Grade 3')) metadata$Size <- as.numeric(metadata$Size) metadata$Time.RFS <- as.numeric(gsub('^KJX|^KJ', NA, metadata$Time.RFS)) # remove samples from the pdata that have any NA value discard <- apply(metadata, 1, function(x) any(is.na(x))) metadata <- metadata[!discard,] # filter the expression data to match the samples in our pdata x <- x[,which(colnames(x) %in% rownames(metadata))] # check that sample names match exactly between pdata and expression data all(colnames(x) == rownames(metadata)) ## ----------------------------------------------------------------------------- p <- pca(x, metadata = metadata, removeVar = 0.1) ## ----ex1, warning = FALSE, fig.height = 7, fig.width = 18, fig.cap = 'Figure 1: A scree plot to show the proportion of explained variance by PC'---- screeplot(p) ## ----ex2, fig.height = 7, fig.width = 8, fig.cap = 'Figure 2: A bi-plot of PC1 versus PC2'---- biplot(p) ## ----ex3, message = FALSE, fig.height = 10, fig.width = 10, fig.cap = 'Figure 3: A pairs plot, comparing PC1 - PC5 on a pairwise basis'---- pairsplot(p) ## ----ex4, fig.height = 6, fig.width = 8, fig.cap = 'Figure 4: Plot the component loadings and label genes most responsible for variation'---- plotloadings(p) ## ----ex5, warning = FALSE, fig.height = 4, fig.width = 8, fig.cap = 'Figure 5: Correlate PCs to metadata variables'---- eigencorplot(p, metavars = c('Age','Distant.RFS','ER','GGI','Grade','Size','Time.RFS')) ## ---- warning = FALSE--------------------------------------------------------- horn <- parallelPCA(x) horn$n ## ----------------------------------------------------------------------------- elbow <- findElbowPoint(p$variance) elbow ## ----ex6, fig.height = 7, fig.width = 9, fig.cap = 'Figure 6: Advanced scree plot illustrating optimum number of PCs'---- library(ggplot2) screeplot(p, components = getComponents(p, 1:20), vline = c(horn$n, elbow)) + geom_text(aes(horn$n + 1, 50, label = "Horn's", vjust = -1)) + geom_text(aes(elbow + 1, 50, label = "Elbow", vjust = -1)) ## ----ex7, fig.height = 6, fig.width = 8, fig.cap = 'Figure 7: adding lines and a legend to a bi-plot'---- biplot(p, lab = paste0(p$metadata$Age, 'yo'), colby = 'ER', hline = 0, vline = 0, legendPosition = 'right') ## ----ex8, message = FALSE, fig.height = 7, fig.width = 7, fig.cap = 'Figure 8: supplying custom colours to a bi-plot'---- biplot(p, colby = 'ER', colkey = c('ER+'='forestgreen', 'ER-'='purple'), hline = 0, vline = c(-25, 0, 25), legendPosition = 'top', legendLabSize = 16, legendIconSize = 8.0) ## ----ex9, message = FALSE, fig.height = 7, fig.width = 7, fig.cap = 'Figure 9: supplying custom colours and shapes to a bi-plot, removing connectors, and modifying titles'---- biplot(p, colby = 'ER', colkey = c('ER+'='forestgreen', 'ER-'='purple'), hline = 0, vline = c(-25, 0, 25), legendPosition = 'top', legendLabSize = 16, legendIconSize = 8.0, shape = 'Grade', shapekey = c('Grade 1'=15, 'Grade 2'=17, 'Grade 3'=8), drawConnectors = FALSE, title = 'PCA bi-plot', subtitle = 'PC1 versus PC2', caption = '27 PCs == 80%') ## ----ex10, message = FALSE, fig.height = 6, fig.width = 8, fig.cap = 'Figure 10: removing labels, modifying line types, removing gridlines, and increasing point size in a bi-plot'---- biplot(p, lab = NULL, colby = 'ER', colkey = c('ER+'='royalblue', 'ER-'='red3'), hline = 0, vline = c(-25, 0, 25), vlineType = c('dotdash', 'solid', 'dashed'), gridlines.major = FALSE, gridlines.minor = FALSE, pointSize = 5, legendPosition = 'left', legendLabSize = 16, legendIconSize = 8.0, shape = 'Grade', shapekey = c('Grade 1'=15, 'Grade 2'=17, 'Grade 3'=8), drawConnectors = FALSE, title = 'PCA bi-plot', subtitle = 'PC1 versus PC2', caption = '27 PCs == 80%') ## ----ex11, fig.height = 6, fig.width = 8, fig.cap = 'Figure 11: colouring by a continuous variable and plotting other PCs in a bi-plot'---- biplot(p, x = 'PC10', y = 'PC50', lab = NULL, colby = 'Age', hline = 0, vline = 0, hlineWidth = 1.0, vlineWidth = 1.0, gridlines.major = FALSE, gridlines.minor = TRUE, pointSize = 5, legendPosition = 'left', legendLabSize = 16, legendIconSize = 8.0, shape = 'Grade', shapekey = c('Grade 1'=15, 'Grade 2'=17, 'Grade 3'=8), drawConnectors = FALSE, title = 'PCA bi-plot', subtitle = 'PC10 versus PC50', caption = '27 PCs == 80%') ## ----ex12, message = FALSE, fig.height = 8, fig.width = 7, fig.cap = 'Figure 12: maximising available space in a pairs plot'---- pairsplot(p, components = getComponents(p, c(1:10)), triangle = TRUE, trianglelabSize = 12, hline = 0, vline = 0, pointSize = 0.4, gridlines.major = FALSE, gridlines.minor = FALSE, colby = 'Grade', title = 'Pairs plot', plotaxes = FALSE, margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm')) ## ----ex13, fig.height = 6, fig.width = 8, fig.cap = 'Figure 13: arranging a pairs plot horizontally'---- pairsplot(p, components = getComponents(p, c(4,33,11,1)), triangle = FALSE, hline = 0, vline = 0, pointSize = 0.8, gridlines.major = FALSE, gridlines.minor = FALSE, colby = 'ER', title = 'Pairs plot', plotaxes = TRUE, margingaps = unit(c(0.1, 0.1, 0.1, 0.1), 'cm')) ## ----ex14, fig.height = 6, fig.width = 9, fig.cap = 'Figure 14: modifying cut-off for labeling in a loadings plot'---- plotloadings(p, rangeRetain = 0.01, labSize = 3.0, title = 'Loadings plot', subtitle = 'PC1, PC2, PC3, PC4, PC5', caption = 'Top 1% variables', shape = 24, col = c('limegreen', 'black', 'red3'), drawConnectors = TRUE) ## ----eval = FALSE------------------------------------------------------------- # # library(biomaRt) # # mart <- useMart('ENSEMBL_MART_ENSEMBL', host = 'useast.ensembl.org') # mart <- useDataset('hsapiens_gene_ensembl', mart) # # getBM(mart = mart, # attributes = c('affy_hg_u133a', 'ensembl_gene_id', # 'gene_biotype', 'external_gene_name'), # filter = 'affy_hg_u133a', # values = c('215281_x_at', '214464_at', '211122_s_at', '205225_at', # '202037_s_at', '204540_at', '215176_x_at', '205044_at', '208650_s_at', # '205380_at'), # uniqueRows = TRUE) # ## ----ex15, fig.height = 9, fig.width = 11, fig.cap = 'Figure 15: plotting absolute component loadings'---- plotloadings(p, components = getComponents(p, c(4,33,11,1)), rangeRetain = 0.1, labSize = 3.0, absolute = FALSE, title = 'Loadings plot', subtitle = 'Misc PCs', caption = 'Top 10% variables', shape = 23, shapeSizeRange = c(1, 16), col = c('white', 'pink'), drawConnectors = FALSE) ## ----ex16, warning = FALSE, fig.height = 4, fig.width = 12, fig.cap = 'Figure 16: correlating PCs that account for at least 80% variation to clinical variables'---- eigencorplot(p, components = getComponents(p, 1:27), metavars = c('Age','Distant.RFS','ER','GGI','Grade','Size','Time.RFS'), col = c('darkblue', 'blue2', 'black', 'red2', 'darkred'), cexCorval = 0.7, colCorval = 'white', fontCorval = 2, posLab = 'bottomleft', rotLabX = 45, posColKey = 'top', cexLabColKey = 1.5, scale = TRUE, main = 'PC1-27 clinical correlations', colFrame = 'white', plotRsquared = FALSE) ## ----ex17, warning = FALSE, fig.height = 5, fig.width = 12, fig.cap = 'Figure 17: modifying cut-offs and symbols for statistical significance in eigencorplot'---- eigencorplot(p, components = getComponents(p, 1:horn$n), metavars = c('Age','Distant.RFS','ER','GGI','Grade','Size','Time.RFS'), col = c('white', 'cornsilk1', 'gold', 'forestgreen', 'darkgreen'), cexCorval = 1.2, fontCorval = 2, posLab = 'all', rotLabX = 45, scale = TRUE, main = bquote(Principal ~ component ~ Pearson ~ r^2 ~ clinical ~ correlates), plotRsquared = TRUE, corFUN = 'pearson', corUSE = 'pairwise.complete.obs', corMultipleTestCorrection = 'BH', signifSymbols = c('****', '***', '**', '*', ''), signifCutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1)) ## ----ex18, message = FALSE, warning = FALSE, fig.height = 10, fig.width = 15, fig.cap = 'Figure 18: a merged panel of all PCAtools plots'---- pscree <- screeplot(p, components = getComponents(p, 1:30), hline = 80, vline = 27, axisLabSize = 10, returnPlot = FALSE) + geom_text(aes(20, 80, label = '80% explained variation', vjust = -1)) ppairs <- pairsplot(p, components = getComponents(p, c(1:3)), triangle = TRUE, trianglelabSize = 12, hline = 0, vline = 0, pointSize = 0.8, gridlines.major = FALSE, gridlines.minor = FALSE, colby = 'Grade', title = '', titleLabSize = 16, plotaxes = FALSE, margingaps = unit(c(0.01, 0.01, 0.01, 0.01), 'cm'), returnPlot = FALSE) pbiplot <- biplot(p, lab = NULL, colby = 'ER', colkey = c('ER+'='royalblue', 'ER-'='red3'), hline = 0, vline = c(-25, 0, 25), vlineType = c('dotdash', 'solid', 'dashed'), gridlines.major = FALSE, gridlines.minor = FALSE, pointSize = 2, axisLabSize = 12, legendPosition = 'left', legendLabSize = 10, legendIconSize = 3.0, shape = 'Grade', shapekey = c('Grade 1'=15, 'Grade 2'=17, 'Grade 3'=8), drawConnectors = FALSE, title = 'PCA bi-plot', subtitle = 'PC1 versus PC2', caption = '27 PCs == 80%', returnPlot = FALSE) ploadings <- plotloadings(p, rangeRetain = 0.01, labSize = 2.5, title = 'Loadings plot', axisLabSize = 12, subtitle = 'PC1, PC2, PC3, PC4, PC5', caption = 'Top 1% variables', shape = 24, shapeSizeRange = c(4, 4), col = c('limegreen', 'black', 'red3'), legendPosition = 'none', drawConnectors = FALSE, returnPlot = FALSE) peigencor <- eigencorplot(p, components = getComponents(p, 1:10), metavars = c('Age','Distant.RFS','ER','GGI','Grade','Size','Time.RFS'), #col = c('royalblue', '', 'gold', 'forestgreen', 'darkgreen'), cexCorval = 0.6, fontCorval = 2, posLab = 'all', rotLabX = 45, scale = TRUE, main = "PC clinical correlates", cexMain = 1.5, plotRsquared = FALSE, corFUN = 'pearson', corUSE = 'pairwise.complete.obs', signifSymbols = c('****', '***', '**', '*', ''), signifCutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1), returnPlot = FALSE) library(cowplot) library(ggplotify) top_row <- plot_grid(pscree, ppairs, pbiplot, ncol = 3, labels = c('A', 'B Pairs plot', 'C'), label_fontfamily = 'serif', label_fontface = 'bold', label_size = 22, align = 'h', rel_widths = c(1.05, 0.9, 1.05)) bottom_row <- plot_grid(ploadings, as.grob(peigencor), ncol = 2, labels = c('D', 'E'), label_fontfamily = 'serif', label_fontface = 'bold', label_size = 22, align = 'h', rel_widths = c(1.5, 1.5)) plot_grid(top_row, bottom_row, ncol = 1, rel_heights = c(1.0, 1.0)) ## ----------------------------------------------------------------------------- sessionInfo()