1 TL;DR

This code block is not evaluated. Need a breakdown? Look at the following sections.

suppressWarnings(suppressMessages(require(netDx)))
suppressWarnings(suppressMessages(library(curatedTCGAData)))

# fetch data remotely
brca <- suppressMessages(curatedTCGAData("BRCA",c("mRNAArray"),FALSE,
    version="1.1.38"))

# process input variables
staget <- sub("[abcd]","",sub("t","",colData(brca)$pathology_T_stage))
staget <- suppressWarnings(as.integer(staget))
colData(brca)$STAGE <- staget

pam50 <- colData(brca)$PAM50.mRNA
pam50[which(!pam50 %in% "Luminal A")] <- "notLumA"
pam50[which(pam50 %in% "Luminal A")] <- "LumA"
colData(brca)$pam_mod <- pam50

tmp <- colData(brca)$PAM50.mRNA
idx <- union(which(tmp %in% c("Normal-like","Luminal B","HER2-enriched")),
                    which(is.na(staget)))
pID <- colData(brca)$patientID
tokeep <- setdiff(pID, pID[idx])
brca <- brca[,tokeep,]

smp <- sampleMap(brca)
samps <- smp[which(smp$assay=="BRCA_mRNAArray-20160128"),]
# remove duplicate assays mapped to the same sample
notdup <- samps[which(!duplicated(samps$primary)),"colname"]
brca[[1]] <- suppressMessages(brca[[1]][,notdup])

# colData must have ID and STATUS columns
pID <- colData(brca)$patientID
colData(brca)$ID <- pID
colData(brca)$STATUS <- colData(brca)$pam_mod

# create grouping rules
groupList <- list()
# genes in mRNA data are grouped by pathways
pathList <- readPathways(fetchPathwayDefinitions("January",2018))
groupList[["BRCA_mRNAArray-20160128"]] <- pathList[1:3]
# clinical data is not grouped; each variable is its own feature
groupList[["clinical"]] <- list(
      age="patient.age_at_initial_pathologic_diagnosis",
       stage="STAGE"
)

# create function to tell netDx how to build features (PSN) from your data
makeNets <- function(dataList, groupList, netDir,...) {
    netList <- c() # initialize before is.null() check
    # make RNA nets (NOTE: the check for is.null() is important!)
    # (Pearson correlation)
    if (!is.null(groupList[["BRCA_mRNAArray-20160128"]])) { 
    netList <- makePSN_NamedMatrix(dataList[["BRCA_mRNAArray-20160128"]],
                rownames(dataList[["BRCA_mRNAArray-20160128"]]),
                groupList[["BRCA_mRNAArray-20160128"]],
                netDir,verbose=FALSE, 
                writeProfiles=TRUE,...) 
    }
    
    # make clinical nets (normalized difference)
    netList2 <- c()
    if (!is.null(groupList[["clinical"]])) {
    netList2 <- makePSN_NamedMatrix(dataList$clinical, 
        rownames(dataList$clinical),
        groupList[["clinical"]],netDir,
        simMetric="custom",customFunc=normDiff, # custom function
        writeProfiles=FALSE,
        sparsify=TRUE,verbose=TRUE,...)
    }
    netList <- c(unlist(netList),unlist(netList2))
    return(netList)
}

# run predictor 
set.seed(42) # make results reproducible
outDir <- paste(tempdir(),randAlphanumString(),
    "pred_output",sep=getFileSep())
# To see all messages, remove suppressMessages() and set logging="default".
# To keep all intermediate data, set keepAllData=TRUE
out <- buildPredictor(
      dataList=brca,groupList=groupList,
      makeNetFunc=makeNets,
      outDir=outDir, ## netDx requires absolute path
      numSplits=2L,featScoreMax=2L, featSelCutoff=1L,
      numCores=1L,logging="none",
      keepAllData=FALSE,debugMode=TRUE
   )

# collect results
numSplits <- 2
st <- unique(colData(brca)$STATUS)
acc <- c()         # accuracy
predList <- list() # prediction tables
featScores <- list() # feature scores per class
for (cur in unique(st)) featScores[[cur]] <- list()

for (k in 1:numSplits) { 
    pred <- out[[sprintf("Split%i",k)]][["predictions"]];
    # predictions table
    tmp <- pred[,c("ID","STATUS","TT_STATUS","PRED_CLASS",
                     sprintf("%s_SCORE",st))]
    predList[[k]] <- tmp 
    # accuracy
    acc <- c(acc, sum(tmp$PRED==tmp$STATUS)/nrow(tmp))
    # feature scores
    for (cur in unique(st)) {
       tmp <- out[[sprintf("Split%i",k)]][["featureScores"]][[cur]]
       colnames(tmp) <- c("PATHWAY_NAME","SCORE")
       featScores[[cur]][[sprintf("Split%i",k)]] <- tmp
    }
}

# plot ROC and PR curve, compute AUROC, AUPR
predPerf <- plotPerf(predList, predClasses=st)
# get table of feature scores for each split and patient label
featScores2 <- lapply(featScores, getNetConsensus)
# identify features that consistently perform well
featSelNet <- lapply(featScores2, function(x) {
    callFeatSel(x, fsCutoff=1, fsPctPass=0)
})

# prepare data for EnrichmentMap plotting of top-scoring features
Emap_res <- getEMapInput_many(featScores2,pathList,
    minScore=1,maxScore=2,pctPass=0,out$inputNets,verbose=FALSE)
gmtFiles <- list()
nodeAttrFiles <- list()

for (g in names(Emap_res)) {
    outFile <- paste(outDir,sprintf("%s_nodeAttrs.txt",g),sep=getFileSep())
    write.table(Emap_res[[g]][["nodeAttrs"]],file=outFile,
        sep="\t",col=TRUE,row=FALSE,quote=FALSE)
    nodeAttrFiles[[g]] <- outFile

    outFile <- paste(outDir,sprintf("%s.gmt",g),sep=getFileSep())
    conn <- suppressWarnings(
         suppressMessages(base::file(outFile,"w")))
    tmp <- Emap_res[[g]][["featureSets"]]
    gmtFiles[[g]] <- outFile

    for (cur in names(tmp)) {
        curr <- sprintf("%s\t%s\t%s", cur,cur,
            paste(tmp[[cur]],collapse="\t"))
        writeLines(curr,con=conn)
    }
close(conn)
}

# This step requires Cytoscape to be installed and running.
###plotEmap(gmtFiles[[1]],nodeAttrFiles[[1]],
###         groupClusters=TRUE, hideNodeLabels=TRUE)

# collect data for integrated PSN
featScores2 <- lapply(featScores, getNetConsensus)
featSelNet <- lapply(featScores2, function(x) {
    callFeatSel(x, fsCutoff=2, fsPctPass=1)
})
topPath <- gsub(".profile","",
        unique(unlist(featSelNet)))
topPath <- gsub("_cont.txt","",topPath)
# create groupList limited to top features
g2 <- list();
for (nm in names(groupList)) {
    cur <- groupList[[nm]]
    idx <- which(names(cur) %in% topPath)
    message(sprintf("%s: %i pathways", nm, length(idx)))
    if (length(idx)>0) g2[[nm]] <- cur[idx]
}

# calculates integrated PSN, calculates grouping statistics,
# and plots integrates PSN. Set plotCytoscape=TRUE if Cytoscape is running.
psn <- suppressMessages(
   plotIntegratedPatientNetwork(brca,
  groupList=g2, makeNetFunc=makeNets,
  aggFun="MEAN",prune_X=0.30,prune_useTop=TRUE,
  numCores=1L,calcShortestPath=TRUE,
  showStats=FALSE,
  verbose=FALSE, plotCytoscape=FALSE)
)

# Visualize integrated patient similarity network as a tSNE plot
tsne <- plot_tSNE(psn$patientSimNetwork_unpruned,colData(brca))

2 Introduction

In this example, we will build a binary breast tumour classifier from clinical data and gene expression data. We will use different rules to create features for each data layer. Specifically:

  • Clinical data: Features are defined directly at the level of variables; similarity is defined by normalized difference.
  • Gene expression data: Features are defined at the level of pathways; similarity is defined by pairwise Pearson correlation.

Feature scoring is performed over multiple random splits of the data into train and blind test partitions. Feature selected networks are those that consistently score highly across the multiple splits (e.g. those that score 9 out of 10 in >=70% of splits).

Conceptually, this is what the higher-level logic looks like for a cross-validation design. In the pseudocode example below, the predictor runs for 100 train/test splits. Within a split, features are scored from 0 to 10. Features scoring >=9 are used to predict labels on the held-out test set (20%).

(Note: these aren’t real function calls; this block just serves to illustrate the concept of the design for our purposes)

numSplits <- 100     # num times to split data into train/blind test samples
featScoreMax <- 10   # num folds for cross-validation, also max score for a network
featSelCutoff <- 9
netScores <- list()  # collect <numSplits> set of netScores
perf <- list()       # collect <numSplits> set of test evaluations

for k in 1:numSplits
 [train, test] <- splitData(80:20) # split data using RNG seed
  featScores[[k]] <- scoreFeatures(train, featScoreMax)
 topFeat[[k]] <- applyFeatCutoff(featScores[[k]])
 perf[[k]] <- collectPerformance(topFeat[[k]], test)
end

3 Setup

suppressWarnings(suppressMessages(require(netDx)))

4 Data

In this example, we use curated data from The Cancer Genome Atlas, through the BioConductor curatedTCGAData package. The goal is to classify a breast tumour into either a Luminal A subtype or otherwise. The predictor will integrate clinical variables selected by the user, along with gene expression data.

Here we load the required packages and download clinical and gene expression data.

suppressMessages(library(curatedTCGAData))

Take a look at the available data without downloading any:

curatedTCGAData(diseaseCode="BRCA", assays="*",dry.run=TRUE,
    version="1.1.38")
## snapshotDate(): 2021-05-18
## See '?curatedTCGAData' for 'diseaseCode' and 'assays' inputs
##     ah_id                                      title file_size
## 1   EH584                       BRCA_CNASeq-20160128      0 Mb
## 2   EH585                       BRCA_CNASNP-20160128    9.8 Mb
## 3   EH586                       BRCA_CNVSNP-20160128    2.8 Mb
## 4   EH588             BRCA_GISTIC_AllByGene-20160128    1.3 Mb
## 5  EH2121                 BRCA_GISTIC_Peaks-20160128      0 Mb
## 6   EH589     BRCA_GISTIC_ThresholdedByGene-20160128    0.4 Mb
## 7  EH2122  BRCA_Methylation_methyl27-20160128_assays   63.2 Mb
## 8  EH2123      BRCA_Methylation_methyl27-20160128_se    0.4 Mb
## 9  EH2124 BRCA_Methylation_methyl450-20160128_assays 2613.2 Mb
## 10 EH2125     BRCA_Methylation_methyl450-20160128_se    6.1 Mb
## 11  EH593                 BRCA_miRNASeqGene-20160128    0.6 Mb
## 12  EH594                    BRCA_mRNAArray-20160128   27.3 Mb
## 13  EH595                     BRCA_Mutation-20160128    4.5 Mb
## 14  EH596              BRCA_RNASeq2GeneNorm-20160128   64.5 Mb
## 15  EH597                   BRCA_RNASeqGene-20160128     30 Mb
## 16  EH598                    BRCA_RPPAArray-20160128    1.6 Mb
##                    rdataclass rdatadateadded rdatadateremoved
## 1            RaggedExperiment     2017-10-10             <NA>
## 2            RaggedExperiment     2017-10-10             <NA>
## 3            RaggedExperiment     2017-10-10             <NA>
## 4        SummarizedExperiment     2017-10-10             <NA>
## 5  RangedSummarizedExperiment     2019-01-09             <NA>
## 6        SummarizedExperiment     2017-10-10             <NA>
## 7        SummarizedExperiment     2019-01-09             <NA>
## 8        SummarizedExperiment     2019-01-09             <NA>
## 9            RaggedExperiment     2019-01-09             <NA>
## 10       SummarizedExperiment     2019-01-09             <NA>
## 11       SummarizedExperiment     2017-10-10             <NA>
## 12       SummarizedExperiment     2017-10-10             <NA>
## 13           RaggedExperiment     2017-10-10             <NA>
## 14       SummarizedExperiment     2017-10-10             <NA>
## 15       SummarizedExperiment     2017-10-10             <NA>
## 16       SummarizedExperiment     2017-10-10             <NA>

We will work only with the mRNA data in this example:

brca <- suppressMessages(curatedTCGAData("BRCA",c("mRNAArray"),FALSE,
    version="1.1.38"))

This next code block prepares the TCGA data. In practice you would do this once, and save the data before running netDx, but we run it here to see an end-to-end example.

staget <- sub("[abcd]","",sub("t","",colData(brca)$pathology_T_stage))
staget <- suppressWarnings(as.integer(staget))
colData(brca)$STAGE <- staget

pam50 <- colData(brca)$PAM50.mRNA
pam50[which(!pam50 %in% "Luminal A")] <- "notLumA"
pam50[which(pam50 %in% "Luminal A")] <- "LumA"
colData(brca)$pam_mod <- pam50

tmp <- colData(brca)$PAM50.mRNA
idx <- union(which(tmp %in% c("Normal-like","Luminal B","HER2-enriched")),
                    which(is.na(staget)))
pID <- colData(brca)$patientID
tokeep <- setdiff(pID, pID[idx])
brca <- brca[,tokeep,]

# remove duplicate assays mapped to the same sample
smp <- sampleMap(brca)
samps <- smp[which(smp$assay=="BRCA_mRNAArray-20160128"),]
notdup <- samps[which(!duplicated(samps$primary)),"colname"]
brca[[1]] <- suppressMessages(brca[[1]][,notdup])
## harmonizing input:
##   removing 44 sampleMap rows with 'colname' not in colnames of experiments

The important thing is to create ID and STATUS columns in the sample metadata table. netDx uses these to get the patient identifiers and labels, respectively.

pID <- colData(brca)$patientID
colData(brca)$ID <- pID
colData(brca)$STATUS <- colData(brca)$pam_mod

5 Design custom patient similarity networks (features)

netDx allows the user to define a custom function that takes patient data and variable groupings as input, and returns a set of patient similarity networks (PSN) as output. The user can customize what datatypes are used, how they are grouped, and what defines patient similarity for a given datatype.

When running the predictor (next section), the user simply passes this custom function as an input variable; i.e. the makeNetFunc parameter when calling buildPredictor().

Note: While netDx provides a high degree of flexibility in achieving your design of choice, it is up to the user to ensure that the design, i.e. the similarity metric and variable groupings, is appropriate for your application. Domain knowledge is almost likely required for good design.

netDx requires that this function take some generic parameters as input. These include:

  • dataList: the patient data, provided as a MultiAssayExperiment object. Refer to the tutorials for MultiAssayExperiment to see how to construct those objects from data.
  • groupList: sets of input data that would correspond to individual networks (e.g. genes grouped into pathways)
  • netDir: the directory where the resulting PSN would be stored.

5.1 dataList

Here the BRCA data is already provided to us as a MultiAssayExperiment object:

summary(brca)
##               Length                Class                 Mode 
##                    1 MultiAssayExperiment                   S4

5.2 groupList

This object tells the predictor how to group units when constructing a network. For examples, genes may be grouped into a network representing a pathway. This object is a list; the names match those of dataList while each value is itself a list and reflects a potential network.

groupList <- list()

# genes in mRNA data are grouped by pathways
pathList <- readPathways(fetchPathwayDefinitions("January",2018))
## ---------------------------------------
## Fetching http://download.baderlab.org/EM_Genesets/January_01_2018/Human/symbol/Human_AllPathways_January_01_2018_symbol.gmt
## File: 3aeb7097bb288_Human_AllPathways_January_01_2018_symbol.gmt
## Read 3028 pathways in total, internal list has 3009 entries
##  FILTER: sets with num genes in [10, 200]
##    => 971 pathways excluded
##    => 2038 left
groupList[["BRCA_mRNAArray-20160128"]] <- pathList[1:3]
# clinical data is not grouped; each variable is its own feature
groupList[["clinical"]] <- list(
      age="patient.age_at_initial_pathologic_diagnosis",
       stage="STAGE"
)

So the groupList variable has one entry per data layer:

summary(groupList)
##                         Length Class  Mode
## BRCA_mRNAArray-20160128 3      -none- list
## clinical                2      -none- list

Each entry contains a list, with one entry per feature. Here we have 3 pathway-level features for mRNA and two variable-level features for clinical data.

For example, here are the networks to be created with RNA data. Genes corresponding to pathways are to be grouped into individual network. Such a groupList would create pathway-level networks:

groupList[["BRCA_mRNAArray-20160128"]][1:3]
## $UREA_CYCLE
##  [1] "SLC25A15" "CPS1"     "ASL"      "ARG2"     "SLC25A2"  "OTC"     
##  [7] "NMRAL1"   "NAGS"     "ASS1"     "ARG1"    
## 
## $`CDP-DIACYLGLYCEROL_BIOSYNTHESIS_I`
##  [1] "AGPAT1" "GPD2"   "ABHD5"  "GPAT2"  "CDS1"   "LPCAT3" "LPCAT4" "CDS2"  
##  [9] "AGPAT6" "AGPAT5" "MBOAT7" "AGPAT9" "LCLAT1" "MBOAT2" "AGPAT4" "GPAM"  
## [17] "AGPAT3" "AGPAT2"
## 
## $`SUPERPATHWAY_OF_D-_I_MYO__I_-INOSITOL__1.4.5_-TRISPHOSPHATE_METABOLISM`
##  [1] "IPMK"   "INPP5B" "INPP5F" "INPP5D" "MINPP1" "INPP5A" "ITPKA"  "OCRL"  
##  [9] "ITPKC"  "ITPKB"  "SYNJ2"  "INPP5J" "INPP5K" "PTEN"   "IMPA2"  "INPP1" 
## [17] "SYNJ1"  "INPPL1" "IMPA1"  "IMPAD1"

For clinical data, we want to keep each variable as its own network:

head(groupList[["clinical"]])
## $age
## [1] "patient.age_at_initial_pathologic_diagnosis"
## 
## $stage
## [1] "STAGE"

5.3 Define patient similarity for each network

This function is defined by the user and tells the predictor how to create networks from the provided input data.

This function requires dataList,groupList, and netDir as input variables. The residual ... parameter is to pass additional variables to makePSN_NamedMatrix(), notably numCores (number of parallel jobs).

In this particular example, the custom similarity function does the following:

  1. Creates pathway-level networks from RNA data using the default Pearson correlation measure makePSN_NamedMatrix(writeProfiles=TRUE,...)
  2. Creates variable-level networks from clinical data using a custom similarity function of normalized difference: makePSN_NamedMatrix(writeProfiles=FALSE,simMetric="custom",customFunc=normDiff).
makeNets <- function(dataList, groupList, netDir,...) {
    netList <- c() # initialize before is.null() check
    # make RNA nets (NOTE: the check for is.null() is important!)
    # (Pearson correlation)
    if (!is.null(groupList[["BRCA_mRNAArray-20160128"]])) { 
    netList <- makePSN_NamedMatrix(dataList[["BRCA_mRNAArray-20160128"]],
                rownames(dataList[["BRCA_mRNAArray-20160128"]]),
                groupList[["BRCA_mRNAArray-20160128"]],
                netDir,verbose=FALSE, 
                writeProfiles=TRUE,...) 
    }
    
    # make clinical nets (normalized difference)
    netList2 <- c()
    if (!is.null(groupList[["clinical"]])) {
    netList2 <- makePSN_NamedMatrix(dataList$clinical, 
        rownames(dataList$clinical),
        groupList[["clinical"]],netDir,
        simMetric="custom",customFunc=normDiff, # custom function
        writeProfiles=FALSE,
        sparsify=TRUE,verbose=TRUE,...)
    }
    netList <- c(unlist(netList),unlist(netList2))
    return(netList)
}

Note: dataList and groupList are generic containers that can contain whatever object the user requires to create PSN. The custom function gives the user complete flexibility in feature design.

6 Build predictor

Finally we call the function that runs the netDx predictor. We provide:

  • number of train/test splits over which to collect feature scores and average performance: numSplits,
  • maximum score for features in one round of feature selection (featScoreMax, set to 10)
  • threshold to call feature-selected networks for each train/test split (featSelCutoff); only features scoring this value or higher will be used to classify test patients, and
  • the information to create the PSN, including patient data (dataList), how variables are to be grouped into networks (groupList) and the custom function to generate features (makeNetFunc).

Change numCores to match the number of cores available on your machine for parallel processing.

The call below runs 2 train/test splits. Within each split, it:

  • splits data into train/test using the default split of 80:20
  • score2 networks between 0 to 2 (i.e. featScoreMax=2)
  • uses networks that score >=1 out of 2 (featSelCutoff) to classify test samples for that split.

These are unrealistically low values set so the example will run fast. In practice a good starting point is featScoreMax=10, featSelCutoff=9 and numSplits=100, but these parameters depend on the sample sizes in the dataset and heterogeneity of the samples.

set.seed(42) # make results reproducible
outDir <- paste(tempdir(),randAlphanumString(),
    "pred_output",sep=getFileSep())
# set keepAllData=TRUE to not delete at the end of the predictor run.
# This can be useful for debugging.
out <- buildPredictor(
      dataList=brca,groupList=groupList,
      makeNetFunc=makeNets,
      outDir=outDir, ## netDx requires absolute path
      numSplits=2L,featScoreMax=2L,
      featSelCutoff=1L,
      numCores=1L,debugMode=TRUE,
      logging="none")
## Predictor started at:
## 2021-08-19 07:53:15
## Pearson similarity chosen - enforcing min. 5 patients per net.
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/profiles/1.1.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/INTERACTIONS/1.1.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/1.synonyms -keepAllTies -limitTies
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/profiles/1.2.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/INTERACTIONS/1.2.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/1.synonyms -keepAllTies -limitTies
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/profiles/1.3.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/INTERACTIONS/1.3.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/1.synonyms -keepAllTies -limitTies
## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.mediator.lucene.exporter.Generic2LuceneExporter /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/db.cfg /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/colours.txt
## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.apps.CacheBuilder -cachedir cache -indexDir . -networkDir /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/INTERACTIONS -log /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/test.log
##  Scoring features
## Java 11 or later detected
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.plugin.apps.QueryRunner --data /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/dataset --in flat --out flat --threads 1 --results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/GM_results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/GM_results/CV_1.query /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/GM_results/CV_2.query --netdx-flag true
## QueryRunner time taken: 1.3 s
##  Scoring features
## Java 11 or later detected
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.plugin.apps.QueryRunner --data /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/dataset --in flat --out flat --threads 1 --results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/GM_results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/GM_results/CV_1.query /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/GM_results/CV_2.query --netdx-flag true
## QueryRunner time taken: 1.2 s
## Pearson similarity chosen - enforcing min. 5 patients per net.
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/profiles/1.1.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/INTERACTIONS/1.1.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/1.synonyms -keepAllTies -limitTies
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/profiles/1.2.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/INTERACTIONS/1.2.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/1.synonyms -keepAllTies -limitTies
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/profiles/1.3.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/INTERACTIONS/1.3.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/1.synonyms -keepAllTies -limitTies
## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.mediator.lucene.exporter.Generic2LuceneExporter /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/db.cfg /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/colours.txt
## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.apps.CacheBuilder -cachedir cache -indexDir . -networkDir /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/INTERACTIONS -log /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/test.log
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.plugin.apps.QueryRunner --data /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/dataset --in flat --out flat --threads 1 --results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/LumA_query --netdx-flag true
## Pearson similarity chosen - enforcing min. 5 patients per net.
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/profiles/1.1.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/INTERACTIONS/1.1.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/1.synonyms -keepAllTies -limitTies
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/profiles/1.2.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/INTERACTIONS/1.2.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/1.synonyms -keepAllTies -limitTies
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/profiles/1.3.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/INTERACTIONS/1.3.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/1.synonyms -keepAllTies -limitTies
## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.mediator.lucene.exporter.Generic2LuceneExporter /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/db.cfg /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/colours.txt
## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.apps.CacheBuilder -cachedir cache -indexDir . -networkDir /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/INTERACTIONS -log /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/test.log
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.plugin.apps.QueryRunner --data /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/dataset --in flat --out flat --threads 1 --results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/notLumA_query --netdx-flag true
## Pearson similarity chosen - enforcing min. 5 patients per net.
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/profiles/1.1.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/INTERACTIONS/1.1.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/1.synonyms -keepAllTies -limitTies
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/profiles/1.2.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/INTERACTIONS/1.2.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/1.synonyms -keepAllTies -limitTies
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/profiles/1.3.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/INTERACTIONS/1.3.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/1.synonyms -keepAllTies -limitTies
## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.mediator.lucene.exporter.Generic2LuceneExporter /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/db.cfg /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/colours.txt
## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.apps.CacheBuilder -cachedir cache -indexDir . -networkDir /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/INTERACTIONS -log /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/test.log
##  Scoring features
## Java 11 or later detected
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.plugin.apps.QueryRunner --data /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/dataset --in flat --out flat --threads 1 --results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/GM_results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/GM_results/CV_1.query /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/GM_results/CV_2.query --netdx-flag true
## QueryRunner time taken: 1.5 s
##  Scoring features
## Java 11 or later detected
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.plugin.apps.QueryRunner --data /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/dataset --in flat --out flat --threads 1 --results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/GM_results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/GM_results/CV_1.query /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/GM_results/CV_2.query --netdx-flag true
## QueryRunner time taken: 1.3 s
## Pearson similarity chosen - enforcing min. 5 patients per net.
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/profiles/1.1.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/INTERACTIONS/1.1.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/1.synonyms -keepAllTies -limitTies
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/profiles/1.2.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/INTERACTIONS/1.2.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/1.synonyms -keepAllTies -limitTies
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/profiles/1.3.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/INTERACTIONS/1.3.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/1.synonyms -keepAllTies -limitTies
## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.mediator.lucene.exporter.Generic2LuceneExporter /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/db.cfg /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/colours.txt
## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.apps.CacheBuilder -cachedir cache -indexDir . -networkDir /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/INTERACTIONS -log /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/test.log
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.plugin.apps.QueryRunner --data /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/dataset --in flat --out flat --threads 1 --results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/LumA_query --netdx-flag true
## Pearson similarity chosen - enforcing min. 5 patients per net.
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/profiles/1.1.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/INTERACTIONS/1.1.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/1.synonyms -keepAllTies -limitTies
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/profiles/1.2.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/INTERACTIONS/1.2.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/1.synonyms -keepAllTies -limitTies
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/profiles/1.3.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/INTERACTIONS/1.3.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/1.synonyms -keepAllTies -limitTies
## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.mediator.lucene.exporter.Generic2LuceneExporter /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/db.cfg /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/colours.txt
## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.apps.CacheBuilder -cachedir cache -indexDir . -networkDir /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/INTERACTIONS -log /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/test.log
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.plugin.apps.QueryRunner --data /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/dataset --in flat --out flat --threads 1 --results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/notLumA_query --netdx-flag true
## Predictor completed at:
## 2021-08-19 07:58:44

7 Examine output

The results are stored in the list object returned by the buildPredictor() call. This list contains:

  • inputNets: all input networks that the model started with.
  • Split<i>: a list with results for each train-test split
    • predictions: real and predicted labels for test patients
    • accuracy: percent accuracy of predictions
    • featureScores: feature scores for each label (list with g entries, where g is number of patient labels). Each entry contains the feature selection scores for the corresponding label.
    • featureSelected: vector of features that pass feature selection. List of length g, with one entry per label.
summary(out)
##           Length Class  Mode     
## inputNets 10     -none- character
## Split1     4     -none- list     
## Split2     4     -none- list
summary(out$Split1)
##                 Length Class      Mode   
## featureScores      2   -none-     list   
## featureSelected    2   -none-     list   
## predictions     2692   data.frame list   
## accuracy           1   -none-     numeric

7.1 Reformat results for further analysis

This code collects different components of model output to examine the results.

numSplits <- 2
st <- unique(colData(brca)$STATUS)
acc <- c()         # accuracy
predList <- list() # prediction tables

featScores <- list() # feature scores per class
for (cur in unique(st)) featScores[[cur]] <- list()

for (k in 1:numSplits) { 
    pred <- out[[sprintf("Split%i",k)]][["predictions"]];
    # predictions table
    tmp <- pred[,c("ID","STATUS","TT_STATUS","PRED_CLASS",
                     sprintf("%s_SCORE",st))]
    predList[[k]] <- tmp 
    # accuracy
    acc <- c(acc, sum(tmp$PRED==tmp$STATUS)/nrow(tmp))
    # feature scores
    for (cur in unique(st)) {
       tmp <- out[[sprintf("Split%i",k)]][["featureScores"]][[cur]]
       colnames(tmp) <- c("PATHWAY_NAME","SCORE")
       featScores[[cur]][[sprintf("Split%i",k)]] <- tmp
    }
}

7.2 Compute model performance

After compiling the data above, plot accuracy for each train/test split:

print(acc)
## [1] 0.8208955 0.8208955

Create a ROC curve, a precision-recall curve, and plot average AUROC and AUPR:

predPerf <- plotPerf(predList, predClasses=st)

7.3 Examine feature scores and consistently high-scoring features

Use getNetConsensus() to convert the list data structure into a single table, one per patient label. The rows show train/test splits and the columns show features that consistently perform well.

We then use callFeatSel() to identi