1 TL;DR

This code block is not evaluated. Need a breakdown? Look at the following sections.

# Load netDx and the TCGA data package with their start-up chatter silenced.
# library() is used instead of require(): a missing package then stops the
# script right here, rather than returning FALSE and failing later on the
# first call to a function that was never loaded.
suppressWarnings(suppressMessages(library(netDx)))
suppressWarnings(suppressMessages(library(curatedTCGAData)))

# Fetch the BRCA mRNA array data remotely (dry.run = FALSE downloads it).
brca <- suppressMessages(
    curatedTCGAData(
        diseaseCode = "BRCA", assays = c("mRNAArray"),
        dry.run = FALSE, version = "1.1.38"
    )
)

# Derive an integer STAGE from the pathology T stage: drop the first "t",
# then the first a-d letter, then coerce. Whatever is still non-numeric
# becomes NA (the coercion warning is silenced on purpose).
# NOTE(review): presumably the raw values look like "t2" / "t2a" - confirm.
staget <- colData(brca)$pathology_T_stage
staget <- sub("t", "", staget)
staget <- sub("[abcd]", "", staget)
staget <- suppressWarnings(as.integer(staget))
colData(brca)$STAGE <- staget

# Collapse the PAM50 call into two labels: "LumA" for "Luminal A",
# "notLumA" for everything else (missing calls included).
pam50 <- colData(brca)$PAM50.mRNA
isLumA <- pam50 %in% "Luminal A"
pam50[!isLumA] <- "notLumA"
pam50[isLumA] <- "LumA"
colData(brca)$pam_mod <- pam50

# Drop patients whose original PAM50 call is one of the listed subtypes,
# and patients whose stage could not be parsed.
droppedSubtypes <- c("Normal-like", "Luminal B", "HER2-enriched")
idx <- union(
    which(colData(brca)$PAM50.mRNA %in% droppedSubtypes),
    which(is.na(staget))
)
pID <- colData(brca)$patientID
tokeep <- setdiff(pID, pID[idx])
brca <- brca[, tokeep, ]

# Keep only the first mRNA array column recorded for each patient, so that
# no patient is represented by duplicate assays.
smp <- sampleMap(brca)
samps <- smp[which(smp$assay == "BRCA_mRNAArray-20160128"), ]
firstPerPatient <- !duplicated(samps$primary)
notdup <- samps[which(firstPerPatient), "colname"]
brca[[1]] <- suppressMessages(brca[[1]][, notdup])

# netDx reads patient identifiers from the ID column and class labels from
# the STATUS column of colData, so both are created here.
pID <- colData(brca)[["patientID"]]
colData(brca)[["ID"]] <- pID
colData(brca)[["STATUS"]] <- colData(brca)[["pam_mod"]]

# Grouping rules, one entry per data layer:
#  - mRNA: genes are grouped by pathway (only the first three pathways are
#    used here)
#  - clinical: no grouping; each variable is its own feature
pathList <- readPathways(fetchPathwayDefinitions("January", 2018))
groupList <- list(
    "BRCA_mRNAArray-20160128" = pathList[1:3],
    clinical = list(
        age = "patient.age_at_initial_pathologic_diagnosis",
        stage = "STAGE"
    )
)

# create function to tell netDx how to build features (PSN) from your data
# Build the patient similarity networks (PSN) for each data layer that has
# an entry in groupList.
#   dataList  - patient data, one entry per layer
#   groupList - feature groupings, one entry per layer; a layer without an
#               entry is skipped
#   netDir    - handed to makePSN_NamedMatrix() as its fourth argument
#   ...       - forwarded unchanged to every makePSN_NamedMatrix() call
# Returns the unlist()-ed results of the per-layer calls joined with c();
# this is NULL when neither layer has an entry in groupList.
makeNets <- function(dataList, groupList, netDir, ...) {
    rnaLayer <- "BRCA_mRNAArray-20160128"

    # RNA nets (Pearson correlation). The is.null() guard is important:
    # the layer is only processed when groupList has an entry for it.
    rnaNets <- NULL
    if (!is.null(groupList[[rnaLayer]])) {
        rnaNets <- makePSN_NamedMatrix(
            dataList[[rnaLayer]],
            rownames(dataList[[rnaLayer]]),
            groupList[[rnaLayer]],
            netDir,
            verbose = FALSE,
            writeProfiles = TRUE,
            ...
        )
    }

    # Clinical nets: similarity is the custom normDiff function
    # (normalized difference).
    clinNets <- NULL
    if (!is.null(groupList[["clinical"]])) {
        clinNets <- makePSN_NamedMatrix(
            dataList$clinical,
            rownames(dataList$clinical),
            groupList[["clinical"]],
            netDir,
            simMetric = "custom",
            customFunc = normDiff,
            writeProfiles = FALSE,
            sparsify = TRUE,
            verbose = TRUE,
            ...
        )
    }

    c(unlist(rnaNets), unlist(clinNets))
}

# Run the predictor.
# The seed is set before the random directory name is drawn, so the whole
# run, including the output path suffix, is reproducible.
set.seed(42)
runTag <- randAlphanumString()
outDir <- paste(tempdir(), runTag, "pred_output", sep = getFileSep())

# logging="none" keeps the run quiet; use logging="default" to see all
# messages. Set keepAllData=TRUE to keep all intermediate data.
out <- buildPredictor(
    dataList = brca,
    groupList = groupList,
    makeNetFunc = makeNets,
    outDir = outDir, # netDx requires an absolute path
    numSplits = 2L,
    featScoreMax = 2L,
    featSelCutoff = 1L,
    numCores = 1L,
    logging = "none",
    keepAllData = FALSE,
    debugMode = TRUE
)

# Collect per-split results from the predictor output.
#   acc        - fraction of correctly predicted samples, one value per split
#   predList   - prediction table, one per split
#   featScores - feature scores, per class label and then per split
numSplits <- 2 # must match the numSplits value given to buildPredictor()
st <- unique(colData(brca)$STATUS)
acc <- numeric(numSplits)
predList <- vector("list", numSplits)
featScores <- list()
for (cur in st) featScores[[cur]] <- list()

# seq_len() instead of 1:numSplits, so zero splits means zero iterations
for (k in seq_len(numSplits)) {
    splitName <- sprintf("Split%i", k)
    pred <- out[[splitName]][["predictions"]]

    # predictions table: identifiers, labels and one score column per class
    predTab <- pred[, c("ID", "STATUS", "TT_STATUS", "PRED_CLASS",
        sprintf("%s_SCORE", st))]
    predList[[k]] <- predTab

    # accuracy: the column is named in full (PRED_CLASS) instead of relying
    # on `$` partial matching of "PRED"
    acc[k] <- sum(predTab$PRED_CLASS == predTab$STATUS) / nrow(predTab)

    # feature scores for every class in this split
    for (cur in st) {
        scoreTab <- out[[splitName]][["featureScores"]][[cur]]
        colnames(scoreTab) <- c("PATHWAY_NAME", "SCORE")
        featScores[[cur]][[splitName]] <- scoreTab
    }
}

# Plot ROC and PR curves and compute AUROC / AUPR from the per-split
# prediction tables.
predPerf <- plotPerf(predList, predClasses = st)

# For each patient label, combine the per-split feature scores into one
# table.
featScores2 <- lapply(featScores, function(perSplit) {
    getNetConsensus(perSplit)
})

# Identify features that consistently perform well (cutoff 1, pass fraction
# 0, matching the tiny example run).
featSelNet <- lapply(featScores2, callFeatSel, fsCutoff = 1, fsPctPass = 0)

# Prepare data for EnrichmentMap plotting of top-scoring features, then
# write two files per entry of the result into outDir:
#   <name>_nodeAttrs.txt - tab-separated node attribute table
#   <name>.gmt           - one line per feature set: name, name, members
# The written paths are recorded in nodeAttrFiles and gmtFiles.
Emap_res <- getEMapInput_many(featScores2, pathList,
    minScore = 1, maxScore = 2, pctPass = 0, out$inputNets, verbose = FALSE)
gmtFiles <- list()
nodeAttrFiles <- list()

for (g in names(Emap_res)) {
    outFile <- paste(outDir, sprintf("%s_nodeAttrs.txt", g), sep = getFileSep())
    # argument names are spelled out in full (col.names / row.names) rather
    # than relying on partial matching of "col" / "row"
    write.table(Emap_res[[g]][["nodeAttrs"]], file = outFile,
        sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
    nodeAttrFiles[[g]] <- outFile

    outFile <- paste(outDir, sprintf("%s.gmt", g), sep = getFileSep())
    tmp <- Emap_res[[g]][["featureSets"]]
    gmtFiles[[g]] <- outFile

    conn <- suppressWarnings(
        suppressMessages(base::file(outFile, "w")))
    # the finally clause closes the connection even if a write fails
    tryCatch({
        for (cur in names(tmp)) {
            curr <- sprintf("%s\t%s\t%s", cur, cur,
                paste(tmp[[cur]], collapse = "\t"))
            writeLines(curr, con = conn)
        }
    }, finally = close(conn))
}

# This step requires Cytoscape to be installed and running.
###plotEmap(gmtFiles[[1]],nodeAttrFiles[[1]],
###         groupClusters=TRUE, hideNodeLabels=TRUE)

# Collect data for the integrated PSN: re-run feature selection with the
# strictest settings of this example (score cutoff 2, pass fraction 1).
featScores2 <- lapply(featScores, getNetConsensus)
featSelNet <- lapply(featScores2, function(x) {
    callFeatSel(x, fsCutoff = 2, fsPctPass = 1)
})

# Remove the ".profile" and "_cont.txt" markers from the selected names so
# they can be compared with the names in groupList. fixed = TRUE makes the
# "." a literal dot; as a regular expression it would match any character.
# NOTE(review): presumably these are file suffixes added by netDx - confirm.
topPath <- unique(unlist(featSelNet))
topPath <- gsub(".profile", "", topPath, fixed = TRUE)
topPath <- gsub("_cont.txt", "", topPath, fixed = TRUE)

# Create a groupList limited to the top features; a layer with no selected
# feature is left out entirely.
g2 <- list()
for (nm in names(groupList)) {
    cur <- groupList[[nm]]
    idx <- which(names(cur) %in% topPath)
    message(sprintf("%s: %i pathways", nm, length(idx)))
    if (length(idx) > 0) g2[[nm]] <- cur[idx]
}

# Calculate the integrated PSN and its grouping statistics, and plot the
# integrated PSN. Set plotCytoscape=TRUE if Cytoscape is running.
psn <- suppressMessages(
    plotIntegratedPatientNetwork(
        brca,
        groupList = g2,
        makeNetFunc = makeNets,
        aggFun = "MEAN",
        prune_X = 0.30,
        prune_useTop = TRUE,
        numCores = 1L,
        calcShortestPath = TRUE,
        showStats = FALSE,
        verbose = FALSE,
        plotCytoscape = FALSE
    )
)

# Visualize the unpruned integrated patient similarity network as a tSNE
# plot.
tsne <- plot_tSNE(psn$patientSimNetwork_unpruned, colData(brca))

2 Introduction

In this example, we will build a binary breast tumour classifier from clinical data and gene expression data. We will use different rules to create features for each data layer. Specifically:

Clinical data: Features are defined directly at the level of variables; similarity is defined by normalized difference.
Gene expression data: Features are defined at the level of pathways; similarity is defined by pairwise Pearson correlation.

Feature scoring is performed over multiple random splits of the data into train and blind test partitions. Feature-selected networks are those that consistently score highly across the multiple splits (e.g. those that score at least 9 out of 10 in >=70% of splits).

Conceptually, this is what the higher-level logic looks like for a cross-validation design. In the pseudocode example below, the predictor runs for 100 train/test splits. Within a split, features are scored from 0 to 10. Features scoring >=9 are used to predict labels on the held-out test set (20%).

(Note: these aren’t real function calls; this block just serves to illustrate the concept of the design for our purposes)

numSplits <- 100     # num times to split data into train/blind test samples
featScoreMax <- 10   # num folds for cross-validation, also max score for a network
featSelCutoff <- 9
netScores <- list()  # collect <numSplits> set of netScores
perf <- list()       # collect <numSplits> set of test evaluations

for k in 1:numSplits
 [train, test] <- splitData(80:20) # split data using RNG seed
 netScores[[k]] <- scoreFeatures(train, featScoreMax)
 topFeat[[k]] <- applyFeatCutoff(netScores[[k]])
 perf[[k]] <- collectPerformance(topFeat[[k]], test)
end

3 Setup

# library() rather than require(): a missing netDx installation stops here
# with a clear error instead of returning FALSE and failing further down.
suppressWarnings(suppressMessages(library(netDx)))

4 Data

In this example, we use curated data from The Cancer Genome Atlas, through the Bioconductor curatedTCGAData package. The goal is to classify a breast tumour into either a Luminal A subtype or otherwise. The predictor will integrate clinical variables selected by the user, along with gene expression data.

Here we load the required packages and download clinical and gene expression data.

suppressMessages(library(curatedTCGAData))

Take a look at the available data without downloading any:

# Dry run: list every BRCA assay in this data version without downloading
# any of them.
curatedTCGAData(
    diseaseCode = "BRCA",
    assays = "*",
    version = "1.1.38",
    dry.run = TRUE
)

## snapshotDate(): 2021-05-18

## See '?curatedTCGAData' for 'diseaseCode' and 'assays' inputs

##     ah_id                                      title file_size
## 1   EH584                       BRCA_CNASeq-20160128      0 Mb
## 2   EH585                       BRCA_CNASNP-20160128    9.8 Mb
## 3   EH586                       BRCA_CNVSNP-20160128    2.8 Mb
## 4   EH588             BRCA_GISTIC_AllByGene-20160128    1.3 Mb
## 5  EH2121                 BRCA_GISTIC_Peaks-20160128      0 Mb
## 6   EH589     BRCA_GISTIC_ThresholdedByGene-20160128    0.4 Mb
## 7  EH2122  BRCA_Methylation_methyl27-20160128_assays   63.2 Mb
## 8  EH2123      BRCA_Methylation_methyl27-20160128_se    0.4 Mb
## 9  EH2124 BRCA_Methylation_methyl450-20160128_assays 2613.2 Mb
## 10 EH2125     BRCA_Methylation_methyl450-20160128_se    6.1 Mb
## 11  EH593                 BRCA_miRNASeqGene-20160128    0.6 Mb
## 12  EH594                    BRCA_mRNAArray-20160128   27.3 Mb
## 13  EH595                     BRCA_Mutation-20160128    4.5 Mb
## 14  EH596              BRCA_RNASeq2GeneNorm-20160128   64.5 Mb
## 15  EH597                   BRCA_RNASeqGene-20160128     30 Mb
## 16  EH598                    BRCA_RPPAArray-20160128    1.6 Mb
##                    rdataclass rdatadateadded rdatadateremoved
## 1            RaggedExperiment     2017-10-10             <NA>
## 2            RaggedExperiment     2017-10-10             <NA>
## 3            RaggedExperiment     2017-10-10             <NA>
## 4        SummarizedExperiment     2017-10-10             <NA>
## 5  RangedSummarizedExperiment     2019-01-09             <NA>
## 6        SummarizedExperiment     2017-10-10             <NA>
## 7        SummarizedExperiment     2019-01-09             <NA>
## 8        SummarizedExperiment     2019-01-09             <NA>
## 9            RaggedExperiment     2019-01-09             <NA>
## 10       SummarizedExperiment     2019-01-09             <NA>
## 11       SummarizedExperiment     2017-10-10             <NA>
## 12       SummarizedExperiment     2017-10-10             <NA>
## 13           RaggedExperiment     2017-10-10             <NA>
## 14       SummarizedExperiment     2017-10-10             <NA>
## 15       SummarizedExperiment     2017-10-10             <NA>
## 16       SummarizedExperiment     2017-10-10             <NA>

We will work only with the mRNA data in this example:

# Download (dry.run = FALSE) only the mRNA array assay for BRCA.
brca <- suppressMessages(
    curatedTCGAData(
        diseaseCode = "BRCA", assays = c("mRNAArray"),
        dry.run = FALSE, version = "1.1.38"
    )
)

This next code block prepares the TCGA data. In practice you would do this once, and save the data before running netDx, but we run it here to see an end-to-end example.

# Derive an integer STAGE from the pathology T stage: drop the first "t",
# then the first a-d letter, then coerce. Whatever is still non-numeric
# becomes NA (the coercion warning is silenced on purpose).
# NOTE(review): presumably the raw values look like "t2" / "t2a" - confirm.
staget <- colData(brca)$pathology_T_stage
staget <- sub("t", "", staget)
staget <- sub("[abcd]", "", staget)
staget <- suppressWarnings(as.integer(staget))
colData(brca)$STAGE <- staget

# Collapse the PAM50 call into two labels: "LumA" for "Luminal A",
# "notLumA" for everything else (missing calls included).
pam50 <- colData(brca)$PAM50.mRNA
isLumA <- pam50 %in% "Luminal A"
pam50[!isLumA] <- "notLumA"
pam50[isLumA] <- "LumA"
colData(brca)$pam_mod <- pam50

# Drop patients whose original PAM50 call is one of the listed subtypes,
# and patients whose stage could not be parsed.
droppedSubtypes <- c("Normal-like", "Luminal B", "HER2-enriched")
idx <- union(
    which(colData(brca)$PAM50.mRNA %in% droppedSubtypes),
    which(is.na(staget))
)
pID <- colData(brca)$patientID
tokeep <- setdiff(pID, pID[idx])
brca <- brca[, tokeep, ]

# Remove duplicate assays mapped to the same sample: only the first mRNA
# array column recorded for each patient is kept.
smp <- sampleMap(brca)
samps <- smp[which(smp$assay == "BRCA_mRNAArray-20160128"), ]
firstPerPatient <- !duplicated(samps$primary)
notdup <- samps[which(firstPerPatient), "colname"]
brca[[1]] <- suppressMessages(brca[[1]][, notdup])

## harmonizing input:
##   removing 44 sampleMap rows with 'colname' not in colnames of experiments

The important thing is to create ID and STATUS columns in the sample metadata table. netDx uses these to get the patient identifiers and labels, respectively.

# ID carries the patient identifier and STATUS the class label; netDx reads
# both columns from the sample metadata.
pID <- colData(brca)[["patientID"]]
colData(brca)[["ID"]] <- pID
colData(brca)[["STATUS"]] <- colData(brca)[["pam_mod"]]

5 Design custom patient similarity networks (features)

netDx allows the user to define a custom function that takes patient data and variable groupings as input, and returns a set of patient similarity networks (PSN) as output. The user can customize what datatypes are used, how they are grouped, and what defines patient similarity for a given datatype.

When running the predictor (next section), the user simply passes this custom function as an input variable; i.e. the makeNetFunc parameter when calling buildPredictor().

Note: While netDx provides a high degree of flexibility in achieving your design of choice, it is up to the user to ensure that the design, i.e. the similarity metric and variable groupings, is appropriate for the application. Domain knowledge is almost certainly required for good design.

netDx requires that this function take some generic parameters as input. These include:

dataList: the patient data, provided as a MultiAssayExperiment object. Refer to the tutorials for MultiAssayExperiment to see how to construct those objects from data.
groupList: sets of input data that would correspond to individual networks (e.g. genes grouped into pathways)
netDir: the directory where the resulting PSN would be stored.

5.1 dataList

Here the BRCA data is already provided to us as a MultiAssayExperiment object:

# Print a summary of the data container to confirm its class.
summary(brca)

##               Length                Class                 Mode 
##                    1 MultiAssayExperiment                   S4

5.2 groupList

This object tells the predictor how to group units when constructing a network. For example, genes may be grouped into a network representing a pathway. This object is a list; the names match those of dataList while each value is itself a list and reflects a potential network.

# Start with an empty set of grouping rules; layers are added below.
groupList <- list()

# Genes in the mRNA data are grouped by pathway: fetch the pathway
# definitions first, then read them into a list of gene sets.
pathwayFile <- fetchPathwayDefinitions("January", 2018)
pathList <- readPathways(pathwayFile)

## ---------------------------------------

## Fetching http://download.baderlab.org/EM_Genesets/January_01_2018/Human/symbol/Human_AllPathways_January_01_2018_symbol.gmt

## File: 3aeb7097bb288_Human_AllPathways_January_01_2018_symbol.gmt

## Read 3028 pathways in total, internal list has 3009 entries

##  FILTER: sets with num genes in [10, 200]

##    => 971 pathways excluded
##    => 2038 left

# mRNA layer: keep only the first three pathways so the example runs fast.
groupList[["BRCA_mRNAArray-20160128"]] <- pathList[seq_len(3)]

# Clinical layer: no grouping; each variable is its own feature.
clinicalFeatures <- list(
    age = "patient.age_at_initial_pathologic_diagnosis",
    stage = "STAGE"
)
groupList[["clinical"]] <- clinicalFeatures

So the groupList variable has one entry per data layer:

# One row per data layer, with the number of features defined for it.
summary(groupList)

##                         Length Class  Mode
## BRCA_mRNAArray-20160128 3      -none- list
## clinical                2      -none- list

Each entry contains a list, with one entry per feature. Here we have 3 pathway-level features for mRNA and two variable-level features for clinical data.

For example, here are the networks to be created with RNA data. Genes corresponding to pathways are to be grouped into individual networks. Such a groupList would create pathway-level networks:

# Show the first three gene sets defined for the mRNA layer.
groupList[["BRCA_mRNAArray-20160128"]][1:3]

## $UREA_CYCLE
##  [1] "SLC25A15" "CPS1"     "ASL"      "ARG2"     "SLC25A2"  "OTC"     
##  [7] "NMRAL1"   "NAGS"     "ASS1"     "ARG1"    
## 
## $`CDP-DIACYLGLYCEROL_BIOSYNTHESIS_I`
##  [1] "AGPAT1" "GPD2"   "ABHD5"  "GPAT2"  "CDS1"   "LPCAT3" "LPCAT4" "CDS2"  
##  [9] "AGPAT6" "AGPAT5" "MBOAT7" "AGPAT9" "LCLAT1" "MBOAT2" "AGPAT4" "GPAM"  
## [17] "AGPAT3" "AGPAT2"
## 
## $`SUPERPATHWAY_OF_D-_I_MYO__I_-INOSITOL__1.4.5_-TRISPHOSPHATE_METABOLISM`
##  [1] "IPMK"   "INPP5B" "INPP5F" "INPP5D" "MINPP1" "INPP5A" "ITPKA"  "OCRL"  
##  [9] "ITPKC"  "ITPKB"  "SYNJ2"  "INPP5J" "INPP5K" "PTEN"   "IMPA2"  "INPP1" 
## [17] "SYNJ1"  "INPPL1" "IMPA1"  "IMPAD1"

For clinical data, we want to keep each variable as its own network:

# Show the clinical features; each one names a single variable.
head(groupList[["clinical"]])

## $age
## [1] "patient.age_at_initial_pathologic_diagnosis"
## 
## $stage
## [1] "STAGE"

5.3 Define patient similarity for each network

This function is defined by the user and tells the predictor how to create networks from the provided input data.

This function requires dataList, groupList, and netDir as input variables. The residual ... parameter is used to pass additional variables to makePSN_NamedMatrix(), notably numCores (number of parallel jobs).

In this particular example, the custom similarity function does the following:

Creates pathway-level networks from RNA data using the default Pearson correlation measure: makePSN_NamedMatrix(writeProfiles=TRUE,...).
Creates variable-level networks from clinical data using a custom similarity function of normalized difference: makePSN_NamedMatrix(writeProfiles=FALSE,simMetric="custom",customFunc=normDiff).

# Build the patient similarity networks (PSN) for each data layer that has
# an entry in groupList.
#   dataList  - patient data, one entry per layer
#   groupList - feature groupings, one entry per layer; a layer without an
#               entry is skipped
#   netDir    - handed to makePSN_NamedMatrix() as its fourth argument
#   ...       - forwarded unchanged to every makePSN_NamedMatrix() call
# Returns the unlist()-ed results of the per-layer calls joined with c();
# this is NULL when neither layer has an entry in groupList.
makeNets <- function(dataList, groupList, netDir, ...) {
    rnaLayer <- "BRCA_mRNAArray-20160128"

    # RNA nets (Pearson correlation). The is.null() guard is important:
    # the layer is only processed when groupList has an entry for it.
    rnaNets <- NULL
    if (!is.null(groupList[[rnaLayer]])) {
        rnaNets <- makePSN_NamedMatrix(
            dataList[[rnaLayer]],
            rownames(dataList[[rnaLayer]]),
            groupList[[rnaLayer]],
            netDir,
            verbose = FALSE,
            writeProfiles = TRUE,
            ...
        )
    }

    # Clinical nets: similarity is the custom normDiff function
    # (normalized difference).
    clinNets <- NULL
    if (!is.null(groupList[["clinical"]])) {
        clinNets <- makePSN_NamedMatrix(
            dataList$clinical,
            rownames(dataList$clinical),
            groupList[["clinical"]],
            netDir,
            simMetric = "custom",
            customFunc = normDiff,
            writeProfiles = FALSE,
            sparsify = TRUE,
            verbose = TRUE,
            ...
        )
    }

    c(unlist(rnaNets), unlist(clinNets))
}

Note: dataList and groupList are generic containers that can contain whatever object the user requires to create PSN. The custom function gives the user complete flexibility in feature design.

6 Build predictor

Finally we call the function that runs the netDx predictor. We provide:

number of train/test splits over which to collect feature scores and average performance: numSplits,
maximum score for features in one round of feature selection (featScoreMax; set to 2 in the example below, while 10 is a typical value),
threshold to call feature-selected networks for each train/test split (featSelCutoff); only features scoring this value or higher will be used to classify test patients, and
the information to create the PSN, including patient data (dataList), how variables are to be grouped into networks (groupList) and the custom function to generate features (makeNetFunc).

Change numCores to match the number of cores available on your machine for parallel processing.

The call below runs 2 train/test splits. Within each split, it:

splits data into train/test using the default split of 80:20
scores networks between 0 and 2 (i.e. featScoreMax=2)
uses networks that score >=1 out of 2 (featSelCutoff) to classify test samples for that split.

These are unrealistically low values set so the example will run fast. In practice a good starting point is featScoreMax=10, featSelCutoff=9 and numSplits=100, but these parameters depend on the sample sizes in the dataset and heterogeneity of the samples.

# The seed is set before the random directory name is drawn, so the whole
# run, including the output path suffix, is reproducible.
set.seed(42)
runTag <- randAlphanumString()
outDir <- paste(tempdir(), runTag, "pred_output", sep = getFileSep())

# Set keepAllData=TRUE to not delete intermediate data at the end of the
# predictor run. This can be useful for debugging.
out <- buildPredictor(
    dataList = brca,
    groupList = groupList,
    makeNetFunc = makeNets,
    outDir = outDir, # netDx requires an absolute path
    numSplits = 2L,
    featScoreMax = 2L,
    featSelCutoff = 1L,
    numCores = 1L,
    debugMode = TRUE,
    logging = "none"
)

## Predictor started at:

## 2021-08-19 07:53:15

## Pearson similarity chosen - enforcing min. 5 patients per net.

## Making Java call

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/profiles/1.1.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/INTERACTIONS/1.1.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/1.synonyms -keepAllTies -limitTies

## Making Java call

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/profiles/1.2.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/INTERACTIONS/1.2.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/1.synonyms -keepAllTies -limitTies

## Making Java call

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/profiles/1.3.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/INTERACTIONS/1.3.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/1.synonyms -keepAllTies -limitTies

## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.mediator.lucene.exporter.Generic2LuceneExporter /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/db.cfg /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/colours.txt

## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.apps.CacheBuilder -cachedir cache -indexDir . -networkDir /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/INTERACTIONS -log /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/test.log

##  Scoring features

## Java 11 or later detected

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.plugin.apps.QueryRunner --data /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/dataset --in flat --out flat --threads 1 --results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/GM_results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/GM_results/CV_1.query /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/GM_results/CV_2.query --netdx-flag true

## QueryRunner time taken: 1.3 s

##  Scoring features

## Java 11 or later detected

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.plugin.apps.QueryRunner --data /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/dataset --in flat --out flat --threads 1 --results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/GM_results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/GM_results/CV_1.query /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/GM_results/CV_2.query --netdx-flag true

## QueryRunner time taken: 1.2 s

## Pearson similarity chosen - enforcing min. 5 patients per net.

## Making Java call

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/profiles/1.1.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/INTERACTIONS/1.1.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/1.synonyms -keepAllTies -limitTies

## Making Java call

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/profiles/1.2.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/INTERACTIONS/1.2.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/1.synonyms -keepAllTies -limitTies

## Making Java call

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/profiles/1.3.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/INTERACTIONS/1.3.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/1.synonyms -keepAllTies -limitTies

## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.mediator.lucene.exporter.Generic2LuceneExporter /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/db.cfg /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/colours.txt

## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.apps.CacheBuilder -cachedir cache -indexDir . -networkDir /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/INTERACTIONS -log /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/test.log

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.plugin.apps.QueryRunner --data /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/dataset --in flat --out flat --threads 1 --results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/LumA_query --netdx-flag true

## Pearson similarity chosen - enforcing min. 5 patients per net.

## Making Java call

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/profiles/1.1.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/INTERACTIONS/1.1.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/1.synonyms -keepAllTies -limitTies

## Making Java call

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/profiles/1.2.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/INTERACTIONS/1.2.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/1.synonyms -keepAllTies -limitTies

## Making Java call

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/profiles/1.3.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/INTERACTIONS/1.3.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/1.synonyms -keepAllTies -limitTies

## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.mediator.lucene.exporter.Generic2LuceneExporter /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/db.cfg /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/colours.txt

## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.apps.CacheBuilder -cachedir cache -indexDir . -networkDir /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/INTERACTIONS -log /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/test.log

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.plugin.apps.QueryRunner --data /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/dataset --in flat --out flat --threads 1 --results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/notLumA_query --netdx-flag true

## Pearson similarity chosen - enforcing min. 5 patients per net.

## Making Java call

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/profiles/1.1.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/INTERACTIONS/1.1.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/1.synonyms -keepAllTies -limitTies

## Making Java call

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/profiles/1.2.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/INTERACTIONS/1.2.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/1.synonyms -keepAllTies -limitTies

## Making Java call

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/profiles/1.3.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/INTERACTIONS/1.3.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/1.synonyms -keepAllTies -limitTies

## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.mediator.lucene.exporter.Generic2LuceneExporter /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/db.cfg /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/colours.txt

## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.apps.CacheBuilder -cachedir cache -indexDir . -networkDir /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/INTERACTIONS -log /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/test.log

##  Scoring features

## Java 11 or later detected

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.plugin.apps.QueryRunner --data /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/dataset --in flat --out flat --threads 1 --results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/GM_results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/GM_results/CV_1.query /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/GM_results/CV_2.query --netdx-flag true

## QueryRunner time taken: 1.5 s

##  Scoring features

## Java 11 or later detected

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.plugin.apps.QueryRunner --data /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/dataset --in flat --out flat --threads 1 --results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/GM_results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/GM_results/CV_1.query /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/GM_results/CV_2.query --netdx-flag true

## QueryRunner time taken: 1.3 s

## Pearson similarity chosen - enforcing min. 5 patients per net.

## Making Java call

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/profiles/1.1.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/INTERACTIONS/1.1.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/1.synonyms -keepAllTies -limitTies

## Making Java call

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/profiles/1.2.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/INTERACTIONS/1.2.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/1.synonyms -keepAllTies -limitTies

## Making Java call

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/profiles/1.3.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/INTERACTIONS/1.3.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/1.synonyms -keepAllTies -limitTies

## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.mediator.lucene.exporter.Generic2LuceneExporter /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/db.cfg /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/colours.txt

## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.apps.CacheBuilder -cachedir cache -indexDir . -networkDir /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/INTERACTIONS -log /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/test.log

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.plugin.apps.QueryRunner --data /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/dataset --in flat --out flat --threads 1 --results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/LumA_query --netdx-flag true

## Pearson similarity chosen - enforcing min. 5 patients per net.

## Making Java call

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/profiles/1.1.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/INTERACTIONS/1.1.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/1.synonyms -keepAllTies -limitTies

## Making Java call

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/profiles/1.2.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/INTERACTIONS/1.2.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/1.synonyms -keepAllTies -limitTies

## Making Java call

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/profiles/1.3.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/INTERACTIONS/1.3.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/1.synonyms -keepAllTies -limitTies

## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.mediator.lucene.exporter.Generic2LuceneExporter /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/db.cfg /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/colours.txt

## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.apps.CacheBuilder -cachedir cache -indexDir . -networkDir /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/INTERACTIONS -log /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/test.log

## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.plugin.apps.QueryRunner --data /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/dataset --in flat --out flat --threads 1 --results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/notLumA_query --netdx-flag true

## Predictor completed at:

## 2021-08-19 07:58:44

7 Examine output

The results are stored in the list object returned by the buildPredictor() call. This list contains:

inputNets: all input networks that the model started with.
Split<i>: a list with results for each train-test split
- predictions: real and predicted labels for test patients
- accuracy: percent accuracy of predictions
- featureScores: feature scores for each label (list with g entries, where g is number of patient labels). Each entry contains the feature selection scores for the corresponding label.
- featureSelected: vector of features that pass feature selection. List of length g, with one entry per label.

summary(out)

##           Length Class  Mode     
## inputNets 10     -none- character
## Split1     4     -none- list     
## Split2     4     -none- list

summary(out$Split1)

##                 Length Class      Mode   
## featureScores      2   -none-     list   
## featureSelected    2   -none-     list   
## predictions     2692   data.frame list   
## accuracy           1   -none-     numeric

7.1 Reformat results for further analysis

This code collects different components of model output to examine the results.

# Collect per-split model output into flat structures for later analysis:
#   acc        - numeric vector, fraction of correct predictions per split
#   predList   - list of prediction tables, one per split
#   featScores - featScores[[label]][["Split<k>"]] holds the feature-score
#                table (PATHWAY_NAME, SCORE) for that label and split
numSplits <- 2
st <- unique(colData(brca)$STATUS)      # class labels (already de-duplicated)
acc <- numeric(numSplits)               # accuracy, preallocated per split
predList <- vector("list", numSplits)   # prediction tables

featScores <- list() # feature scores per class
for (cur in st) featScores[[cur]] <- list()

# seq_len() keeps the loop empty (rather than running 1:0) if numSplits is 0
for (k in seq_len(numSplits)) {
    splitName <- sprintf("Split%i",k)
    pred <- out[[splitName]][["predictions"]]
    # predictions table: keep identifiers, labels and one score column per class
    tmp <- pred[,c("ID","STATUS","TT_STATUS","PRED_CLASS",
                     sprintf("%s_SCORE",st))]
    predList[[k]] <- tmp
    # accuracy: use the full column name; `tmp$PRED` only worked through
    # partial matching of `$` and would break if another PRED* column appeared
    acc[k] <- sum(tmp$PRED_CLASS==tmp$STATUS)/nrow(tmp)
    # feature scores, one table per class label
    for (cur in st) {
       tmp <- out[[splitName]][["featureScores"]][[cur]]
       colnames(tmp) <- c("PATHWAY_NAME","SCORE")
       featScores[[cur]][[splitName]] <- tmp
    }
}

7.2 Compute model performance

After compiling the data above, print the accuracy for each train/test split:

print(acc)

## [1] 0.8208955 0.8208955

Create a ROC curve, a precision-recall curve, and plot average AUROC and AUPR:

# Draw the ROC and precision-recall curves from the per-split prediction
# tables; the value returned by plotPerf() is kept in predPerf
predPerf <- plotPerf(predList, predClasses=st)

7.3 Examine feature scores and consistently high-scoring features

Use getNetConsensus() to convert the list data structure into a single table, one per patient label. The rows show features and the columns show train/test splits; each cell holds the score that feature received in that split.

We then use callFeatSel() to identify features that consistently perform well across the various train/test splits. Because this is a toy example, we set the bar very low to get some features. Here we accept a feature if it scores 1 or higher (fsCutoff=1) in even one split, so the required fraction of passing splits is set as low as possible (fsPctPass=0 in the code below).

# Collapse the per-split score lists into one consensus table per class label
featScores2 <- lapply(featScores, getNetConsensus)
# Overview: one entry per class label
summary(featScores2)

##         Length Class      Mode
## LumA    3      data.frame list
## notLumA 3      data.frame list

head(featScores2[["LumA"]])

##                                                                     PATHWAY_NAME
## 1                                      CDP-DIACYLGLYCEROL_BIOSYNTHESIS_I.profile
## 2 SUPERPATHWAY_OF_D-_I_MYO__I_-INOSITOL__1.4.5_-TRISPHOSPHATE_METABOLISM.profile
## 3                                                             UREA_CYCLE.profile
## 4                                                                   age_cont.txt
## 5                                                                 stage_cont.txt
##   Split1 Split2
## 1      2      2
## 2      2      2
## 3      2      2
## 4      1      1
## 5     NA      1

In practice, a recommended setting is fsCutoff=9 and fsPctPass=0.7 to get features that score at least 9 (out of 10) in at least 70% of the train/test splits.

# Toy thresholds for this example: score cutoff 1, pass fraction 0
featSelNet <- lapply(featScores2, callFeatSel, fsCutoff=1, fsPctPass=0)
# Show the consensus score table for the LumA class
print(head(featScores2[["LumA"]]))

##                                                                     PATHWAY_NAME
## 1                                      CDP-DIACYLGLYCEROL_BIOSYNTHESIS_I.profile
## 2 SUPERPATHWAY_OF_D-_I_MYO__I_-INOSITOL__1.4.5_-TRISPHOSPHATE_METABOLISM.profile
## 3                                                             UREA_CYCLE.profile
## 4                                                                   age_cont.txt
## 5                                                                 stage_cont.txt
##   Split1 Split2
## 1      2      2
## 2      2      2
## 3      2      2
## 4      1      1
## 5     NA      1

7.4 Visualize EnrichmentMap

An EnrichmentMap is a network-based visualization of pathway connectivity and is used in netDx to visualize themes in predictive pathway-based features. It is used in conjunction with AutoAnnotate to identify clusters, and apply auto-generated labels to these. For more information, see the EnrichmentMap website at baderlab.org.

Use getEMapInput_many() to create the input that helps generate the EnrichmentMap in Cytoscape.

# Build the EnrichmentMap input, one entry per class label, from the
# consensus feature scores and the pathway definitions.
# minScore/maxScore match the score range of this toy run (scores top out at 2).
# NOTE(review): out$inputNets is passed positionally after named arguments;
# confirm it binds to the intended parameter of getEMapInput_many().
Emap_res <- getEMapInput_many(featScores2,pathList,
    minScore=1,maxScore=2,pctPass=0,out$inputNets,verbose=FALSE)

Write the results to files that Cytoscape can read in:

# Write, for each class label in Emap_res, the two files Cytoscape reads:
#   <label>_nodeAttrs.txt - tab-delimited node attribute table
#   <label>.gmt           - one line per feature set
# The paths are recorded in nodeAttrFiles and gmtFiles, keyed by label.
gmtFiles <- list()
nodeAttrFiles <- list()

for (g in names(Emap_res)) {
    # node attributes: header row, no row names, unquoted
    # (argument names are spelled out instead of relying on partial matching)
    outFile <- paste(outDir,sprintf("%s_nodeAttrs.txt",g),sep=getFileSep())
    write.table(Emap_res[[g]][["nodeAttrs"]],file=outFile,
        sep="\t",col.names=TRUE,row.names=FALSE,quote=FALSE)
    nodeAttrFiles[[g]] <- outFile

    # GMT: set name, description (the name repeated), then tab-separated members
    outFile <- paste(outDir,sprintf("%s.gmt",g),sep=getFileSep())
    tmp <- Emap_res[[g]][["featureSets"]]
    gmtLines <- vapply(names(tmp), function(cur) {
        sprintf("%s\t%s\t%s", cur,cur,
            paste(tmp[[cur]],collapse="\t"))
    }, character(1))
    # Passing a path lets writeLines() open and close the file itself, so no
    # connection is left open if something fails part-way through
    writeLines(gmtLines,con=outFile)
    gmtFiles[[g]] <- outFile
}

Finally, plot the EnrichmentMap. This step requires Cytoscape to be installed, along with the EnrichmentMap and AutoAnnotate apps. It also requires the Cytoscape application to be open and running on the machine running the code. This block is commented out for automatic builds on BioConductor, but a screenshot of the intended result is shown below.

###plotEmap(gmtFiles[[1]],nodeAttrFiles[[1]],
###         groupClusters=TRUE, hideNodeLabels=TRUE)

This example EnrichmentMap isn’t terribly exciting because of the low number of pathway features permitted, the low upper bound on feature selection scores, and the low number of train/test splits. Still, it serves to illustrate the workflow.

EnrichmentMap generated from example in this vignette. The small number of nodes reflects the limited number of pathways provided to the model, and also reduced parameter values for model building.

Here is an example of an EnrichmentMap generated by running the above predictor with more real-world parameter values, and all available pathways:

EnrichmentMap from the same data but using all pathways, more train/test splits and higher range of feature scores.

8 Visualize integrated patient similarity network based on top features

We can apply a threshold to define predictive features, and integrate these into a single patient similarity network. Such a network is useful for downstream operations such as ascertaining whether or not classes are significantly separated and visualization.

Here we define predictive features as those scoring 2 out of 2 in all train/test splits.

# Consensus score table per class label
featScores2 <- lapply(featScores, getNetConsensus)
# Strict selection: score cutoff 2, pass fraction 1 (all splits)
featSelNet <- lapply(featScores2, callFeatSel, fsCutoff=2, fsPctPass=1)

We can examine the features:

print(featSelNet)

## $LumA
## [1] "CDP-DIACYLGLYCEROL_BIOSYNTHESIS_I.profile"                                     
## [2] "SUPERPATHWAY_OF_D-_I_MYO__I_-INOSITOL__1.4.5_-TRISPHOSPHATE_METABOLISM.profile"
## [3] "UREA_CYCLE.profile"                                                            
## 
## $notLumA
## [1] "SUPERPATHWAY_OF_D-_I_MYO__I_-INOSITOL__1.4.5_-TRISPHOSPHATE_METABOLISM.profile"
## [2] "UREA_CYCLE.profile"

Create a new groupList limited to top features:

# Strip the ".profile" / "_cont.txt" suffixes carried by the selected feature
# names so they can be matched against the names in groupList.
# The patterns are escaped and end-anchored: an unescaped "." matches any
# character, so the previous patterns could remove text from inside a name.
topPath <- unique(unlist(featSelNet))
topPath <- sub("\\.profile$","",topPath)
topPath <- sub("_cont\\.txt$","",topPath)
# create groupList limited to top features
g2 <- list()
for (nm in names(groupList)) {
    cur <- groupList[[nm]]
    idx <- which(names(cur) %in% topPath)
    message(sprintf("%s: %i pathways", nm, length(idx)))
    # data layers with no selected feature are left out of g2
    if (length(idx)>0) g2[[nm]] <- cur[idx]
}

## BRCA_mRNAArray-20160128: 3 pathways

## clinical: 0 pathways

Plot the integrated patient network based on the features selected above. Note that at this stage, the similarity measure is inverted into a dissimilarity measure so that nodes with greater similarity are closer (have smaller distance or dissimilarity) in the final network.

In the example below, the networks are integrated by taking the mean of the edge weights (aggFun="MEAN"). For the plotting we retain only the top 30% of the strongest edges (prune_pctX=0.30, prune_useTop=TRUE).

By setting calcShortestPath=TRUE, the function will also compute the pairwise shortest path for within- and across-group nodes. The result is shown as a set of violin plots and a one-sided Wilcoxon-Mann-Whitney test is used to assign significance.

As with plotEmap(), this method must be run on a computer with Cytoscape installed and running. For the purposes of this example, plotCytoscape is set to FALSE and a screenshot of the resulting network is provided below. To plot in Cytoscape, set plotCytoscape=TRUE.

# Integrate the selected features into a single patient similarity network.
# Messages from the call are suppressed to keep the vignette output short.
psn <- suppressMessages(
    plotIntegratedPatientNetwork(
        brca,
        groupList = g2,            # top features only
        makeNetFunc = makeNets,    # same network builder used for the predictor
        aggFun = "MEAN",           # average edge weights across features
        prune_pctX = 0.30,         # edge pruning fraction
        prune_useTop = TRUE,
        numCores = 1L,
        calcShortestPath = TRUE,   # also compute pairwise shortest paths
        showStats = FALSE,
        verbose = FALSE,
        plotCytoscape = FALSE      # set TRUE to draw in a running Cytoscape
    )
)

## Warning in dir.create(paste(netDir, "profiles", sep = fsep)): '/tmp/RtmpfpXO2u/
## profiles' already exists

Patient network after integrating features that scored 2 out of 2 in all train-test splits. For visualization only the top 10% strongest edges are shown. Nodes are patients, and edges are average distance across all features passing feature selection. Green indicates “LumA” status and orange indicates “notLumA” status.

The integrated PSN can also be visualized as a tSNE plot:

# Plot a tSNE view of the unpruned integrated network.
# NOTE(review): colData(brca) is presumably used for patient labels/colours;
# confirm against plot_tSNE().
tsne <- plot_tSNE(psn$patientSimNetwork_unpruned,colData(brca))

## * Making symmetric matrix

## * Running tSNE

## * Plotting

summary(tsne)

##                     Length Class  Mode   
## N                     1    -none- numeric
## Y                   662    -none- numeric
## costs               331    -none- numeric
## itercosts            20    -none- numeric
## origD                 1    -none- numeric
## perplexity            1    -none- numeric
## theta                 1    -none- numeric
## max_iter              1    -none- numeric
## stop_lying_iter       1    -none- numeric
## mom_switch_iter       1    -none- numeric
## momentum              1    -none- numeric
## final_momentum        1    -none- numeric
## eta                   1    -none- numeric
## exaggeration_factor   1    -none- numeric

class(tsne)

## [1] "list"

9 sessionInfo

sessionInfo()

## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS:   /home/biocbuild/bbs-3.13-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.13-bioc/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=C                 LC_NUMERIC=C              
##  [3] LC_TIME=C                  LC_COLLATE=C              
##  [5] LC_MONETARY=C              LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] curatedTCGAData_1.14.0      MultiAssayExperiment_1.18.0
##  [3] SummarizedExperiment_1.22.0 GenomicRanges_1.44.0       
##  [5] GenomeInfoDb_1.28.1         IRanges_2.26.0             
##  [7] S4Vectors_0.30.0            MatrixGenerics_1.4.2       
##  [9] matrixStats_0.60.0          netDx_1.4.3                
## [11] bigmemory_4.5.36            Biobase_2.52.0             
## [13] BiocGenerics_0.38.0         BiocStyle_2.20.2           
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.2                    R.utils_2.10.1               
##   [3] tidyselect_1.1.1              RSQLite_2.2.7                
##   [5] AnnotationDbi_1.54.1          grid_4.1.0                   
##   [7] combinat_0.0-8                BiocParallel_1.26.1          
##   [9] Rtsne_0.15                    RNeXML_2.4.5                 
##  [11] munsell_0.5.0                 ScaledMatrix_1.0.0           
##  [13] base64url_1.4                 codetools_0.2-18             
##  [15] pbdZMQ_0.3-5                  withr_2.4.2                  
##  [17] colorspace_2.0-2              filelock_1.0.2               
##  [19] highr_0.9                     knitr_1.33                   
##  [21] dplR_1.7.2                    uuid_0.1-4                   
##  [23] zinbwave_1.14.1               SingleCellExperiment_1.14.1  
##  [25] ROCR_1.0-11                   NMF_0.23.0                   
##  [27] labeling_0.4.2                repr_1.1.3                   
##  [29] GenomeInfoDbData_1.2.6        farver_2.1.0                 
##  [31] bit64_4.0.5                   rhdf5_2.36.0                 
##  [33] vctrs_0.3.8                   generics_0.1.0               
##  [35] xfun_0.25                     BiocFileCache_2.0.0          
##  [37] R6_2.5.0                      doParallel_1.0.16            
##  [39] ggbeeswarm_0.6.0              netSmooth_1.12.0             
##  [41] rsvd_1.0.5                    RJSONIO_1.3-1.5              
##  [43] locfit_1.5-9.4                bitops_1.0-7                 
##  [45] rhdf5filters_1.4.0            cachem_1.0.5                 
##  [47] DelayedArray_0.18.0           assertthat_0.2.1             
##  [49] promises_1.2.0.1              scales_1.1.1                 
##  [51] beeswarm_0.4.0                gtable_0.3.0                 
##  [53] phylobase_0.8.10              beachmat_2.8.1               
##  [55] rlang_0.4.11                  genefilter_1.74.0            
##  [57] splines_4.1.0                 lazyeval_0.2.2               
##  [59] BiocManager_1.30.16           yaml_2.2.1                   
##  [61] reshape2_1.4.4                backports_1.2.1              
##  [63] httpuv_1.6.2                  tools_4.1.0                  
##  [65] bookdown_0.23                 gridBase_0.4-7               
##  [67] ggplot2_3.3.5                 ellipsis_0.3.2               
##  [69] jquerylib_0.1.4               RColorBrewer_1.1-2           
##  [71] Rcpp_1.0.7                    plyr_1.8.6                   
##  [73] base64enc_0.1-3               sparseMatrixStats_1.4.2      
##  [75] progress_1.2.2                zlibbioc_1.38.0              
##  [77] purrr_0.3.4                   RCurl_1.98-1.4               
##  [79] prettyunits_1.1.1             viridis_0.6.1                
##  [81] cluster_2.1.2                 magrittr_2.0.1               
##  [83] magick_2.7.3                  data.table_1.14.0            
##  [85] mime_0.11                     hms_1.1.0                    
##  [87] evaluate_0.14                 xtable_1.8-4                 
##  [89] XML_3.99-0.7                  gridExtra_2.3                
##  [91] shape_1.4.6                   compiler_4.1.0               
##  [93] scater_1.20.1                 tibble_3.1.3                 
##  [95] RCy3_2.12.4                   crayon_1.4.1                 
##  [97] R.oo_1.24.0                   htmltools_0.5.1.1            
##  [99] entropy_1.3.0                 later_1.3.0                  
## [101] tidyr_1.1.3                   howmany_0.3-1                
## [103] DBI_1.1.1                     ExperimentHub_2.0.0          
## [105] dbplyr_2.1.1                  MASS_7.3-54                  
## [107] rappdirs_0.3.3                Matrix_1.3-4                 
## [109] ade4_1.7-17                   uchardet_1.1.0               
## [111] R.methodsS3_1.8.1             igraph_1.2.6                 
## [113] pkgconfig_2.0.3               bigmemory.sri_0.1.3          
## [115] rncl_0.8.4                    registry_0.5-1               
## [117] locfdr_1.1-8                  signal_0.7-7                 
## [119] IRdisplay_1.0                 scuttle_1.2.1                
## [121] xml2_1.3.2                    foreach_1.5.1                
## [123] annotate_1.70.0               vipor_0.4.5                  
## [125] bslib_0.2.5.1                 rngtools_1.5                 
## [127] pkgmaker_0.32.2               XVector_0.32.0               
## [129] stringr_1.4.0                 digest_0.6.27                
## [131] pracma_2.3.3                  graph_1.70.0                 
## [133] softImpute_1.4-1              Biostrings_2.60.2            
## [135] rmarkdown_2.10                edgeR_3.34.0                 
## [137] DelayedMatrixStats_1.14.2     curl_4.3.2                   
## [139] kernlab_0.9-29                shiny_1.6.0                  
## [141] lifecycle_1.0.0               nlme_3.1-152                 
## [143] jsonlite_1.7.2                clusterExperiment_2.12.0     
## [145] Rhdf5lib_1.14.2               BiocNeighbors_1.10.0         
## [147] viridisLite_0.4.0             limma_3.48.3                 
## [149] fansi_0.5.0                   pillar_1.6.2                 
## [151] lattice_0.20-44               KEGGREST_1.32.0              
## [153] fastmap_1.1.0                 httr_1.4.2                   
## [155] survival_3.2-12               interactiveDisplayBase_1.30.0
## [157] glue_1.4.2                    png_0.1-7                    
## [159] iterators_1.0.13              BiocVersion_3.13.1           
## [161] glmnet_4.1-2                  bit_4.0.4                    
## [163] stringi_1.7.3                 sass_0.4.0                   
## [165] HDF5Array_1.20.0              blob_1.2.2                   
## [167] AnnotationHub_3.0.1           BiocSingular_1.8.1           
## [169] memoise_2.0.0                 IRkernel_1.2                 
## [171] dplyr_1.0.7                   irlba_2.3.3                  
## [173] ape_5.5

Building binary classifier from clinical and ’omic data using pathway-level features

2021-08-19

Package