netDx 1.4.3
This code block is not evaluated. Need a breakdown? Look at the following sections.
suppressWarnings(suppressMessages(require(netDx)))
suppressMessages(require(curatedTCGAData))
# fetch data for example
brca <- suppressMessages(
curatedTCGAData("BRCA",c("mRNAArray"),FALSE, version="1.1.38"))
# reformat for this example
staget <- sub("[abcd]","",
sub("t","",colData(brca)$pathology_T_stage))
staget <- suppressWarnings(as.integer(staget))
colData(brca)$STAGE <- staget
pam50 <- colData(brca)$PAM50.mRNA
# binarize class
pam50[which(!pam50 %in% "Luminal A")] <- "notLumA"
pam50[which(pam50 %in% "Luminal A")] <- "LumA"
colData(brca)$pam_mod <- pam50
tmp <- colData(brca)$PAM50.mRNA
idx <- union(which(tmp %in% c("Normal-like",
"Luminal B","HER2-enriched")),
which(is.na(staget)))
pID <- colData(brca)$patientID
tokeep <- setdiff(pID, pID[idx])
brca <- brca[,tokeep,]
# remove duplicate assays mapped to the same sample
smp <- sampleMap(brca)
samps <- smp[which(smp$assay=="BRCA_mRNAArray-20160128"),]
notdup <- samps[which(!duplicated(samps$primary)),"colname"]
brca[[1]] <- suppressMessages(brca[[1]][,notdup])
pam50 <- colData(brca)$pam_mod
pheno <- colData(brca)
# create holdout set
set.seed(123) # make reproducible
idx_holdout <- c(
sample(which(pam50 == "LumA"),10,F),
sample(which(pam50 == "notLumA"),10,F)
)
holdout <- brca[,rownames(pheno)[idx_holdout]]
colData(holdout)$ID <- as.character(colData(holdout)$patientID)
colData(holdout)$STATUS <- colData(holdout)$pam_mod
# remove holdout samples from training set
tokeep <- setdiff(1:length(pam50),idx_holdout)
brca <- brca[,rownames(pheno)[tokeep]]
pID <- as.character(colData(brca)$patientID)
colData(brca)$ID <- pID
colData(brca)$STATUS <- colData(brca)$pam_mod
# feature design
# genes in mRNA data are grouped by pathways
pathList <- readPathways(fetchPathwayDefinitions("January",2018))
groupList <- list()
groupList[["BRCA_mRNAArray-20160128"]] <- pathList[1:3]
# clinical data is not grouped; each variable is its own feature
groupList[["clinical"]] <- list(
age="patient.age_at_initial_pathologic_diagnosis",
stage="STAGE"
)
# define function to create nets
makeNets <- function(dataList, groupList, netDir, ...) {
netList <- c() # initialize before is.null() check
# make RNA nets (NOTE: the check for is.null() is important!)
# (Pearson correlation)
if (!is.null(groupList[["BRCA_mRNAArray-20160128"]])) {
netList <- makePSN_NamedMatrix(
dataList[["BRCA_mRNAArray-20160128"]],
rownames(dataList[["BRCA_mRNAArray-20160128"]]),
groupList[["BRCA_mRNAArray-20160128"]],
netDir, verbose = FALSE,
writeProfiles = TRUE, runSerially=TRUE, ...)
}
# make clinical nets (normalized difference)
netList2 <- c()
if (!is.null(groupList[["clinical"]])) {
netList2 <- makePSN_NamedMatrix(dataList$clinical,
rownames(dataList$clinical),
groupList[["clinical"]], netDir,
simMetric = "custom",
customFunc = normDiff, # custom function
writeProfiles = FALSE,
sparsify = TRUE, verbose = TRUE, runSerially=TRUE, ...)
}
netList <- c(unlist(netList), unlist(netList2))
return(netList)
}
# run predictor
set.seed(42) # make results reproducible
outDir <- paste(tempdir(),randAlphanumString(),
"pred_output",sep=getFileSep())
if (file.exists(outDir)) unlink(outDir,recursive=TRUE)
numSplits <- 2L
out <- suppressMessages(
buildPredictor(
dataList=brca,groupList=groupList,
makeNetFunc=makeNets,
outDir=outDir,
numSplits=numSplits,
featScoreMax=2L,
featSelCutoff=1L,
numCores=1L,debugMode=FALSE,
logging="none")
)
# compile consistently high-scoring functions
featScores <- list() # feature scores per class
st <- unique(colData(brca)$STATUS)
for (k in 1:numSplits) {
# feature scores
for (cur in unique(st)) {
if (is.null(featScores[[cur]])) featScores[[cur]] <- list()
tmp <- out[[sprintf("Split%i",k)]]
tmp2 <- tmp[["featureScores"]][[cur]]
colnames(tmp2) <- c("PATHWAY_NAME","SCORE")
featScores[[cur]][[sprintf("Split%i",k)]] <- tmp2
}
}
featScores2 <- lapply(featScores, getNetConsensus)
featSelNet <- lapply(featScores2, function(x) {
callFeatSel(x, fsCutoff=1, fsPctPass=0)
})
# predict performance on holdout set
outDir <- paste(tempdir(), randAlphanumString(),
sep = getFileSep())
if (file.exists(outDir)) unlink(outDir,recursive=TRUE)
dir.create(outDir)
predModel <- suppressMessages(
predict(brca, holdout, groupList,
featSelNet, makeNets,
outDir, verbose = FALSE)
)
# compute performance for prediction
perf <- getPerformance(predModel, c("LumA", "notLumA"))
message(sprintf("AUROC=%1.2f", perf$auroc))
message(sprintf("AUPR=%1.2f", perf$aupr))
message(sprintf("Accuracy=%1.1f%%", perf$acc))
# plot ROC and PR curve
plotPerf_multi(list(perf$rocCurve),
plotTitle = sprintf(
"BRCA Validation: %i samples",
nrow(colData(holdout))))
plotPerf_multi(list(perf$prCurve),
plotType = "PR",
plotTitle = sprintf(
"BRCA Validation: %i samples",
nrow(colData(holdout))))
Validating a trained model on an independent new dataset is important to ensure that the model generalizes on new samples and has real-world value. In netDx, models are trained using buildPredictor()
, which also scores features based on their predictive value for various patient labels. Therafter, we use the predict()
function to classify samples held out from the original training. You can just as easily use samples from an independent dataset.
As with the earlier vignette, we will use the application of classifying breast tumour profiles as either “Luminal A” or “other” subtype, by combining clinical and transcriptomic data.
In this example we will separate out 20 samples of the data (10 per class) to validate our trained model; those samples will not be used to train the model. We start by fetching the multi-omic dataset using the curatedTCGAData
package, performing some data formatting, and then finally separating the holdout set (here, holdout
).
suppressWarnings(suppressMessages(require(netDx)))
suppressMessages(require(curatedTCGAData))
brca <- suppressMessages(
curatedTCGAData("BRCA",c("mRNAArray"),FALSE, version="1.1.38"))
staget <- sub("[abcd]","",
sub("t","",colData(brca)$pathology_T_stage))
staget <- suppressWarnings(as.integer(staget))
colData(brca)$STAGE <- staget
pam50 <- colData(brca)$PAM50.mRNA
pam50[which(!pam50 %in% "Luminal A")] <- "notLumA"
pam50[which(pam50 %in% "Luminal A")] <- "LumA"
colData(brca)$pam_mod <- pam50
tmp <- colData(brca)$PAM50.mRNA
idx <- union(which(tmp %in% c("Normal-like",
"Luminal B","HER2-enriched")),
which(is.na(staget)))
pID <- colData(brca)$patientID
tokeep <- setdiff(pID, pID[idx])
brca <- brca[,tokeep,]
# remove duplicate assays mapped to the same sample
smp <- sampleMap(brca)
samps <- smp[which(smp$assay=="BRCA_mRNAArray-20160128"),]
notdup <- samps[which(!duplicated(samps$primary)),"colname"]
brca[[1]] <- suppressMessages(brca[[1]][,notdup])
## harmonizing input:
## removing 44 sampleMap rows with 'colname' not in colnames of experiments
pam50 <- colData(brca)$pam_mod
pheno <- colData(brca)
We next set 20 samples aside for independent validation. In practice, this sample size may be larger (e.g. 30% of the samples are held out), or an independent dataset may be used for the same.
set.seed(123) # make reproducible
idx_holdout <- c(
sample(which(pam50 == "LumA"),10,F),
sample(which(pam50 == "notLumA"),10,F)
)
holdout <- brca[,rownames(pheno)[idx_holdout]]
colData(holdout)$ID <- as.character(colData(holdout)$patientID)
colData(holdout)$STATUS <- colData(holdout)$pam_mod
tokeep <- setdiff(1:length(pam50),idx_holdout)
brca <- brca[,rownames(pheno)[tokeep]]
pID <- as.character(colData(brca)$patientID)
colData(brca)$ID <- pID
colData(brca)$STATUS <- colData(brca)$pam_mod
The rest of the process for data setup and running the model is identical to that shown in previous vignettes.
Create feature groupings:
# genes in mRNA data are grouped by pathways
pathList <- readPathways(fetchPathwayDefinitions("January",2018))
## ---------------------------------------
## Fetching http://download.baderlab.org/EM_Genesets/January_01_2018/Human/symbol/Human_AllPathways_January_01_2018_symbol.gmt
## File: 3aeb7097bb288_Human_AllPathways_January_01_2018_symbol.gmt
## Read 3028 pathways in total, internal list has 3009 entries
## FILTER: sets with num genes in [10, 200]
## => 971 pathways excluded
## => 2038 left
groupList <- list()
groupList[["BRCA_mRNAArray-20160128"]] <- pathList[1:3]
# clinical data is not grouped; each variable is its own feature
groupList[["clinical"]] <- list(
age="patient.age_at_initial_pathologic_diagnosis",
stage="STAGE"
)
Define the function to convert features into patient similarity networks:
makeNets <- function(dataList, groupList, netDir, ...) {
netList <- c() # initialize before is.null() check
# make RNA nets (NOTE: the check for is.null() is important!)
# (Pearson correlation)
if (!is.null(groupList[["BRCA_mRNAArray-20160128"]])) {
netList <- makePSN_NamedMatrix(
dataList[["BRCA_mRNAArray-20160128"]],
rownames(dataList[["BRCA_mRNAArray-20160128"]]),
groupList[["BRCA_mRNAArray-20160128"]],
netDir, verbose = FALSE,
writeProfiles = TRUE, runSerially=TRUE, ...)
}
# make clinical nets (normalized difference)
netList2 <- c()
if (!is.null(groupList[["clinical"]])) {
netList2 <- makePSN_NamedMatrix(dataList$clinical,
rownames(dataList$clinical),
groupList[["clinical"]], netDir,
simMetric = "custom",
customFunc = normDiff, # custom function
writeProfiles = FALSE,
sparsify = TRUE, verbose = TRUE, runSerially=TRUE, ...)
}
netList <- c(unlist(netList), unlist(netList2))
return(netList)
}
Train the model:
set.seed(42) # make results reproducible
outDir <- paste(tempdir(),randAlphanumString(),
"pred_output",sep=getFileSep())
if (file.exists(outDir)) unlink(outDir,recursive=TRUE)
numSplits <- 2L
out <- suppressMessages(
buildPredictor(
dataList=brca,groupList=groupList,
makeNetFunc=makeNets,
outDir=outDir,
numSplits=numSplits,
featScoreMax=2L,
featSelCutoff=1L,
numCores=1L,debugMode=FALSE,
logging="none")
)
## Time difference of 0.08270502 secs
## Time difference of 0.07807302 secs
## Time difference of 0.09972167 secs
## Time difference of 1.70696 secs
## Time difference of 0.07945251 secs
## Time difference of 0.07574821 secs
## Time difference of 0.112453 secs
## Time difference of 0.09866142 secs
## Time difference of 0.09857178 secs
Compile feature scores:
featScores <- list() # feature scores per class
st <- unique(colData(brca)$STATUS)
for (k in 1:numSplits) {
# feature scores
for (cur in unique(st)) {
if (is.null(featScores[[cur]])) featScores[[cur]] <- list()
tmp <- out[[sprintf("Split%i",k)]]
tmp2 <- tmp[["featureScores"]][[cur]]
colnames(tmp2) <- c("PATHWAY_NAME","SCORE")
featScores[[cur]][[sprintf("Split%i",k)]] <- tmp2
}
}
# compile scores across runs
featScores2 <- lapply(featScores, getNetConsensus)
Identify consistently high-scoring features for each class. Here, we select any feature with a score of one or more (fsCutoff=1
), in at least one split (fsPctPass=0
).
In practice you should run a higher number of splits to ensure resampling helps the predictor generalize, and change the other metrics accordingly. For example, scoring features out of 10 over 100 train/test splits, and selecting features that score 7 or higher in 80% of those splits, i.e. buildPredictor(...numSplits = 100, featScoreMax=10L)
and callFeatSel(..., fsCutoff=7, fsPctPass=0.8)
.
featSelNet <- lapply(featScores2, function(x) {
callFeatSel(x, fsCutoff=1, fsPctPass=0)
})
Now we use predict()
to classify samples in the independent dataset. We provide the model with feature design rules in groupList
, the list of selected features to use in featSelNet
, the function to convert data into patient similarity networks in makeNets
, as well as the original and validated datasets in brca
and holdout
respectively.
The training data needs to be provided because netDx creates a single patient similarity network with both training and test data. It then uses label propagation to “diffuse” patient labels from training samples to test samples, and labels the latter based on which class they are most similar to.
outDir <- paste(tempdir(), randAlphanumString(),
sep = getFileSep())
if (file.exists(outDir)) unlink(outDir,recursive=TRUE)
dir.create(outDir)
predModel <- suppressMessages(
predict(brca, holdout, groupList,
featSelNet, makeNets,
outDir, verbose = FALSE)
)
## TT_STATUS
## STATUS TEST TRAIN
## LumA 10 220
## notLumA 10 92
## Time difference of 0.1118648 secs
## Time difference of 0.1492043 secs
## Time difference of 0.1071789 secs
## PRED_CLASS
## STATUS LumA notLumA
## LumA 8 2
## notLumA 1 9
Finally we examine how well our model performed, using getPerformance()
.
Compute performance:
perf <- getPerformance(predModel, c("LumA", "notLumA"))
message(sprintf("AUROC=%1.2f", perf$auroc))
## AUROC=0.91
message(sprintf("AUPR=%1.2f", perf$aupr))
## AUPR=0.76
message(sprintf("Accuracy=%1.1f%%", perf$acc))
## Accuracy=85.0%
We plot the AUROC and AUPR curves using plotPerf_multi()
.
plotPerf_multi(list(perf$rocCurve),
plotTitle = sprintf(
"BRCA Validation: %i samples",
nrow(colData(holdout))))
plotPerf_multi(list(perf$prCurve),
plotType = "PR",
plotTitle = sprintf(
"BRCA Validation: %i samples",
nrow(colData(holdout))))
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS: /home/biocbuild/bbs-3.13-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.13-bioc/R/lib/libRlapack.so
##
## locale:
## [1] LC_CTYPE=C LC_NUMERIC=C
## [3] LC_TIME=C LC_COLLATE=C
## [5] LC_MONETARY=C LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] rhdf5_2.36.0 curatedTCGAData_1.14.0
## [3] MultiAssayExperiment_1.18.0 SummarizedExperiment_1.22.0
## [5] GenomicRanges_1.44.0 GenomeInfoDb_1.28.1
## [7] IRanges_2.26.0 S4Vectors_0.30.0
## [9] MatrixGenerics_1.4.2 matrixStats_0.60.0
## [11] netDx_1.4.3 bigmemory_4.5.36
## [13] Biobase_2.52.0 BiocGenerics_0.38.0
## [15] BiocStyle_2.20.2
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.2 R.utils_2.10.1
## [3] tidyselect_1.1.1 RSQLite_2.2.7
## [5] AnnotationDbi_1.54.1 grid_4.1.0
## [7] combinat_0.0-8 BiocParallel_1.26.1
## [9] Rtsne_0.15 RNeXML_2.4.5
## [11] munsell_0.5.0 ScaledMatrix_1.0.0
## [13] base64url_1.4 codetools_0.2-18
## [15] pbdZMQ_0.3-5 withr_2.4.2
## [17] colorspace_2.0-2 filelock_1.0.2
## [19] highr_0.9 knitr_1.33
## [21] dplR_1.7.2 uuid_0.1-4
## [23] zinbwave_1.14.1 SingleCellExperiment_1.14.1
## [25] ROCR_1.0-11 NMF_0.23.0
## [27] labeling_0.4.2 repr_1.1.3
## [29] GenomeInfoDbData_1.2.6 farver_2.1.0
## [31] bit64_4.0.5 vctrs_0.3.8
## [33] generics_0.1.0 xfun_0.25
## [35] BiocFileCache_2.0.0 R6_2.5.0
## [37] doParallel_1.0.16 ggbeeswarm_0.6.0
## [39] netSmooth_1.12.0 rsvd_1.0.5
## [41] RJSONIO_1.3-1.5 locfit_1.5-9.4
## [43] bitops_1.0-7 rhdf5filters_1.4.0
## [45] cachem_1.0.5 DelayedArray_0.18.0
## [47] assertthat_0.2.1 promises_1.2.0.1
## [49] scales_1.1.1 beeswarm_0.4.0
## [51] gtable_0.3.0 phylobase_0.8.10
## [53] beachmat_2.8.1 rlang_0.4.11
## [55] genefilter_1.74.0 splines_4.1.0
## [57] lazyeval_0.2.2 BiocManager_1.30.16
## [59] yaml_2.2.1 reshape2_1.4.4
## [61] backports_1.2.1 httpuv_1.6.2
## [63] tools_4.1.0 bookdown_0.23
## [65] gridBase_0.4-7 ggplot2_3.3.5
## [67] ellipsis_0.3.2 jquerylib_0.1.4
## [69] RColorBrewer_1.1-2 Rcpp_1.0.7
## [71] plyr_1.8.6 base64enc_0.1-3
## [73] sparseMatrixStats_1.4.2 progress_1.2.2
## [75] zlibbioc_1.38.0 purrr_0.3.4
## [77] RCurl_1.98-1.4 prettyunits_1.1.1
## [79] viridis_0.6.1 cluster_2.1.2
## [81] magrittr_2.0.1 magick_2.7.3
## [83] data.table_1.14.0 mime_0.11
## [85] hms_1.1.0 evaluate_0.14
## [87] xtable_1.8-4 XML_3.99-0.7
## [89] gridExtra_2.3 shape_1.4.6
## [91] compiler_4.1.0 scater_1.20.1
## [93] tibble_3.1.3 RCy3_2.12.4
## [95] crayon_1.4.1 R.oo_1.24.0
## [97] htmltools_0.5.1.1 entropy_1.3.0
## [99] later_1.3.0 tidyr_1.1.3
## [101] howmany_0.3-1 DBI_1.1.1
## [103] ExperimentHub_2.0.0 dbplyr_2.1.1
## [105] MASS_7.3-54 rappdirs_0.3.3
## [107] Matrix_1.3-4 ade4_1.7-17
## [109] uchardet_1.1.0 R.methodsS3_1.8.1
## [111] igraph_1.2.6 pkgconfig_2.0.3
## [113] bigmemory.sri_0.1.3 rncl_0.8.4
## [115] registry_0.5-1 locfdr_1.1-8
## [117] signal_0.7-7 IRdisplay_1.0
## [119] scuttle_1.2.1 xml2_1.3.2
## [121] foreach_1.5.1 annotate_1.70.0
## [123] vipor_0.4.5 bslib_0.2.5.1
## [125] rngtools_1.5 pkgmaker_0.32.2
## [127] XVector_0.32.0 stringr_1.4.0
## [129] digest_0.6.27 pracma_2.3.3
## [131] graph_1.70.0 softImpute_1.4-1
## [133] Biostrings_2.60.2 rmarkdown_2.10
## [135] edgeR_3.34.0 DelayedMatrixStats_1.14.2
## [137] curl_4.3.2 kernlab_0.9-29
## [139] shiny_1.6.0 lifecycle_1.0.0
## [141] nlme_3.1-152 jsonlite_1.7.2
## [143] clusterExperiment_2.12.0 Rhdf5lib_1.14.2
## [145] BiocNeighbors_1.10.0 viridisLite_0.4.0
## [147] limma_3.48.3 fansi_0.5.0
## [149] pillar_1.6.2 lattice_0.20-44
## [151] KEGGREST_1.32.0 fastmap_1.1.0
## [153] httr_1.4.2 survival_3.2-12
## [155] interactiveDisplayBase_1.30.0 glue_1.4.2
## [157] png_0.1-7 iterators_1.0.13
## [159] BiocVersion_3.13.1 glmnet_4.1-2
## [161] bit_4.0.4 stringi_1.7.3
## [163] sass_0.4.0 HDF5Array_1.20.0
## [165] blob_1.2.2 AnnotationHub_3.0.1
## [167] BiocSingular_1.8.1 memoise_2.0.0
## [169] IRkernel_1.2 dplyr_1.0.7
## [171] irlba_2.3.3 ape_5.5