## ----style, echo = FALSE, results = 'asis'------------------------------------ BiocStyle::markdown() ## ----global_options, include=FALSE------------------------------------------------------------------------------------------------- knitr::opts_chunk$set(message=FALSE, error=FALSE, warning=FALSE, fig.width=8, fig.height=8) options(width=133) ## ----eval=FALSE-------------------------------------------------------------------------------------------------------------------- # ## Do not execute if you have already installed BioMM. # if (!requireNamespace("BiocManager", quietly = TRUE)) # install.packages("BiocManager") # # The following initializes usage of Bioc devel # BiocManager::install(version='devel') # BiocManager::install("BioMM") ## ----eval=FALSE-------------------------------------------------------------------------------------------------------------------- # install.packages("devtools") # library("devtools") # install_github("transbioZI/BioMM", build_vignettes=TRUE) ## ----loadPkg, eval=TRUE, results="hide"-------------------------------------------------------------------------------------------- library(BioMM) library(BiocParallel) library(ranger) library(rms) library(glmnet) library(e1071) library(precrec) library(vioplot) library(CMplot) library(imager) library(topGO) library(xlsx) ## ----studyData, eval=TRUE---------------------------------------------------------------------------------------------------------- ## DNA methylation data methylData <- readRDS(system.file("extdata", "/methylData.rds", package="BioMM")) # The first column is the label, and the rest are the features (e.g., CpGs) head(methylData[,1:4]) # 0: control and 1: patient table(methylData[,1]) dim(methylData) ## Gene expression data expData <- readRDS(system.file("extdata", "/expData.rds", package="BioMM")) # The first column is the label, and the rest are the features (e.g., genes) head(expData[,1:4]) # 0: control and 1: patient table(expData[,1]) dim(expData) ## ----annotationFile, eval=TRUE----------------------------------------------------------------------------------------------------- ## Load feature annotation data featureAnno <- readRDS(system.file("extdata", "cpgAnno.rds", package="BioMM")) # The mapping between CpGs and genes (i.e. entrezID or gene symbol) head(featureAnno) # total number of CpGs under investigation str(unique(featureAnno[,1])) ## Reprocessed Gene ontological pathway annotation with 10 and 200 genes for each pathway golist <- readRDS(system.file("extdata", "goDB.rds", package="BioMM")) ## Number of investigated biological processes length(golist) str(golist[1:3]) ## Reprocessed KEGG pathway annotation with 10 and 200 genes for each pathway kegglist <- readRDS(system.file("extdata", "keggDB.rds", package="BioMM")) ## Number of investigated KEGG pathways length(kegglist) str(kegglist[1:3]) ## ----pathlist, eval=TRUE----------------------------------------------------------------------------------------------------------- ## Feature annotation to pathways ## Use 100 pathways to reduce the runtime for the downstream analysis. But if possible, please make sure to use all. numPath <- 100 # GO pathway mapping using DNA methylation data golistSub <- golist[seq_len(numPath)] methylGOlist <- omics2pathlist(data=methylData, pathlistDB=golistSub, featureAnno=featureAnno, restrictUp=200, restrictDown=10, minPathSize=10) # KEGG pathway mapping using DNA methylation data kegglistSub <- kegglist[seq_len(numPath)] methylKEGGlist <- omics2pathlist(data=methylData, pathlistDB=kegglistSub, featureAnno=featureAnno, restrictUp=200, restrictDown=10, minPathSize=10) # GO pathway mapping using gene expression data golistSub <- golist[seq_len(numPath)] expGOlist <- omics2pathlist(data=expData, pathlistDB=golistSub, featureAnno=NULL, restrictUp=200, restrictDown=10, minPathSize=10) # KEGG pathway mapping using gene expression data kegglistSub <- kegglist[seq_len(numPath)] expKEGGlist <- omics2pathlist(data=expData, pathlistDB=kegglistSub, featureAnno=NULL, restrictUp=200, restrictDown=10, minPathSize=10) ## ----BioMMrandForest4methylGO, eval=TRUE------------------------------------------------------------------------------------------- ## To reduce the runtime, only use a subset of DNA methylation data ## However, if possible, subsetting the data is not suggested. trainData <- methylData[,1:3000] trainDataY <- trainData[,1] testData <- NULL ## Model parameters supervisedStage1=TRUE classifier <- "randForest" predMode <- "probability" paramlist <- list(ntree=100, nthreads=20) core <- MulticoreParam(workers = 10) set.seed(123) result <- BioMM(trainData=trainData, testData=NULL, pathlistDB=golistSub, featureAnno, restrictUp=200, restrictDown=10, minPathSize=10, supervisedStage1, typePCA="regular", resample1="BS", resample2="CV", dataMode="allTrain", repeatA1=50, repeatA2=1, repeatB1=10, repeatB2=1, nfolds=10, FSmethod1=NULL, FSmethod2=NULL, cutP1=0.05, cutP2=0.05, fdr2=NULL, FScore=MulticoreParam(workers = 1), classifier, predMode, paramlist, innerCore=core) if (is.null(testData)) { metricCV <- getMetrics(dataY = trainDataY, predY = result) message("Cross-validation prediction performance:") print(metricCV) } else if (!is.null(testData)){ testDataY <- testData[,1] metricCV <- getMetrics(dataY = trainDataY, cvYscore = result[[1]]) metricTest <- getMetrics(dataY = testDataY, testYscore = result[[2]]) message("Cross-validation performance:") print(metricCV) message("Test set prediction performance:") print(metricTest) } ## ----BioMMrandForest4expKEGG, eval=TRUE-------------------------------------------------------------------------------------------- ## to reduce the runtime, only use a subset of gene expression data ## However, if possible, subsetting the data is not suggested. trainData <- expData[,1:3000] trainDataY <- trainData[,1] testData <- NULL ## Only for gene expression data featureAnno=NULL ## Model parameters supervisedStage1=TRUE classifier <- "randForest" predMode <- "probability" paramlist <- list(ntree=100, nthreads=20) core <- MulticoreParam(workers = 10) set.seed(123) result <- BioMM(trainData=trainData, testData=NULL, pathlistDB=kegglistSub, featureAnno, restrictUp=200, restrictDown=10, minPathSize=10, supervisedStage1, typePCA="regular", resample1="BS", resample2="CV", dataMode="allTrain", repeatA1=50, repeatA2=1, repeatB1=10, repeatB2=1, nfolds=10, FSmethod1=NULL, FSmethod2=NULL, cutP1=0.05, cutP2=0.05, fdr2=NULL, FScore=MulticoreParam(workers = 1), classifier, predMode, paramlist, innerCore=core) ## Cross-validation is applied on the training data, therefore 'result' only returns the CV predicted score. metricCV <- getMetrics(dataY = trainDataY, predY = result) message("Cross-validation prediction performance:") print(metricCV) ## ----stage2dataAprep, eval=TRUE---------------------------------------------------------------------------------------------------- ## Define the omics type # omicsType <- "methylation" omicsType <- "expression" pathType <- "GO" pathType <- "KEGG" if (omicsType == "methylation" & pathType == "GO"){ studylist <- methylGOlist } else if (omicsType == "methylation" & pathType == "KEGG"){ studylist <- methylKEGGlist } else if (omicsType == "expression" & pathType == "GO"){ studylist <- expGOlist } else if (omicsType == "expression" & pathType == "KEGG"){ studylist <- expKEGGlist } else { stop("Wrong specified omicsType and pathType!") } length(studylist) ## Model parameters classifier <- "randForest" predMode <- "probability" paramlist <- list(ntree=100, nthreads=20) core <- MulticoreParam(workers = 10) set.seed(123) stage2dataA <- reconBySupervised(trainDataList=studylist, testDataList=NULL, resample="BS", dataMode="allTrain", repeatA=50, repeatB=1, nfolds=10, FSmethod=NULL, cutP=0.05, fdr=NULL, FScore=MulticoreParam(workers = 1), classifier, predMode, paramlist, innerCore=core, outFileA=NULL, outFileB=NULL) ## Check stage-2 data dim(stage2dataA) print(table(stage2dataA[,1])) head(stage2dataA[,1:4]) ## ----stage2dataViz, eval=TRUE------------------------------------------------------------------------------------------------------ core <- MulticoreParam(workers = 10) fileName <- paste0(omicsType,"_", pathType, "_featuresVarExplained.png") plotVarExplained(data=stage2dataA, posF=TRUE, binarize=FALSE, core=core, pathTitle=paste0(pathType, " pathways"), fileName) plot(load.image(fileName)) ## ----topPathFeatures, eval=TRUE, fig.show="hold"----------------------------------------------------------------------------------- core <- MulticoreParam(workers = 1) rankMetric <- "negPlogit" filePrefix <- paste0(omicsType, "_", pathType, "_topPath_", rankMetric) topPath <- plotRankedFeature(data=stage2dataA, posF=TRUE, topF=10, binarize=FALSE, blocklist=studylist, rankMetric=rankMetric, colorMetric="size", core, pathTitle=paste0(pathType, " pathways"), fileName=paste0(filePrefix, ".png")) plot(load.image(paste0(filePrefix, ".png"))) ## ----reportTopPath, eval=TRUE------------------------------------------------------------------------------------------------------ ## Report the top pathways if (pathType == "GO"){ goterms = unlist(Term(GOTERM)) topGOID = gsub("\\.", ":", rownames(topPath)) subTerm = goterms[is.element(names(goterms), topGOID)] topIDmatch = subTerm[match(topGOID, names(subTerm))] topPath <- data.frame(topPath, Description=topIDmatch) } else if (pathType == "KEGG"){ ## A matching list between KEGG ID and names. Data freezes on Aug 2020 keggID2name <- readRDS(system.file("extdata", "/keggID2name202008.rds", package="BioMM")) keggSub <- keggID2name[is.element(keggID2name[,"ID"], rownames(topPath)),] topIDmatch <- keggSub[match(rownames(topPath), keggSub[,"ID"]),] topPath <- data.frame(topPath, Description=topIDmatch[,"name"]) } print(topPath) write.xlsx(topPath,file=paste0(filePrefix, ".xlsx")) # write.table(topPath,file=paste0(filePrefix, ".txt"), sep="\t") ## ----cirPlot, eval=TRUE, fig.show="hold"------------------------------------------------------------------------------------------- core <- MulticoreParam(workers = 10) pathID <- gsub("\\.", ":", rownames(topPath)) ## The number of top pathways must be bigger than overall investigated pathways pathSet <- studylist[is.element(names(studylist), pathID)] pathMatch <- pathSet[match(pathID, names(pathSet))] fileName <- paste0(omicsType, "_", pathType, "_SigRankBy_", rankMetric) cirPlot4pathway(datalist=pathMatch, topPathID=names(pathMatch), core, fileName) ## ----sessioninfo, eval=TRUE-------------------------------------------------------------------------------------------------------- sessionInfo()