## ----include=FALSE, echo=FALSE, eval=TRUE------------------------------------- library(BiocParallel) if (Sys.info()['sysname'] == 'windows') { BiocParallel::register(BiocParallel::SerialParam()) } ## ----load_dependencies, message=FALSE, warning=FALSE, include=FALSE----------- library(PDATK) library(msigdbr) library(data.table) ## ----load_sample_data--------------------------------------------------------- data(sampleCohortList) sampleCohortList ## ----subset_and_split_data---------------------------------------------------- commonGenes <- findCommonGenes(sampleCohortList) # Subsets all list items, with subset for specifying rows and select for # specifying columns cohortList <- subset(sampleCohortList, subset=commonGenes) ICGCcohortList <- cohortList[grepl('ICGC', names(cohortList), ignore.case=TRUE)] validationCohortList <- cohortList[!grepl('icgc', names(cohortList), ignore.case=TRUE)] ## ----drop_not_censored_patients----------------------------------------------- validationCohortList <- dropNotCensored(validationCohortList) ICGCcohortList <- dropNotCensored(ICGCcohortList) ## ----split_train_test--------------------------------------------------------- # find common samples between our training cohorts in a cohort list commonSamples <- findCommonSamples(ICGCcohortList) # split into shared samples for training, the rest for testing ICGCtrainCohorts <- subset(ICGCcohortList, select=commonSamples) ICGCtestCohorts <- subset(ICGCcohortList, select=commonSamples, invert=TRUE) # merge our training cohort test data into the rest of the validation data validationCohortList <- c(ICGCtestCohorts, validationCohortList) # drop ICGCSEQ from the validation data, because it only has 7 patients validationCohortList <- validationCohortList[names(validationCohortList) != 'ICGCSEQ'] ## ----build_PCOSP_model, message=FALSE----------------------------------------- set.seed(1987) PCOSPmodel <- PCOSP(ICGCtrainCohorts, minDaysSurvived=365, randomSeed=1987) # view the model parameters; these make your model run reproducible metadata(PCOSPmodel)$modelParams ## ----messages=FALSE, warning=FALSE-------------------------------------------- trainedPCOSPmodel <- trainModel(PCOSPmodel, numModels=15, minAccuracy=0.6) metadata(trainedPCOSPmodel)$modelParams ## ----PCOSP_predictions, message=FALSE, warning=FALSE-------------------------- PCOSPpredValCohorts <- predictClasses(validationCohortList, model=trainedPCOSPmodel) ## ----predicted_elementMetadata------------------------------------------------ mcols(PCOSPpredValCohorts) ## ----risk_column-------------------------------------------------------------- knitr::kable(head(colData(PCOSPpredValCohorts[[1]]))) ## ----raw_predictions---------------------------------------------------------- knitr::kable(metadata(PCOSPpredValCohorts[[1]])$PCOSPpredictions[1:5, 1:5]) ## ----validate_PCOSP_model, message=FALSE, warning=FALSE----------------------- validatedPCOSPmodel <- validateModel(trainedPCOSPmodel, valData=PCOSPpredValCohorts) ## ----validationStats---------------------------------------------------------- knitr::kable(head(validationStats(validatedPCOSPmodel))) ## ----PCOSP_D_index_forestplot, fig.width=8, fig.height=8, fig.wide=TRUE------- PCOSPdIndexForestPlot <- forestPlot(validatedPCOSPmodel, stat='log_D_index') PCOSPdIndexForestPlot ## ----PCOSP_concordance_index_forestplot, fig.width=8, fig.height=8, fig.wide=TRUE---- PCOSPconcIndexForestPlot <- forestPlot(validatedPCOSPmodel, stat='concordance_index') PCOSPconcIndexForestPlot ## ----PCOSP_ROC_curve, fig.height=8, fig.width=8, fig.wide=TRUE, message=FALSE---- cohortROCplots <- plotROC(validatedPCOSPmodel, alpha=0.05) cohortROCplots ## ----RLSModeL_constructor----------------------------------------------------- # Merge to a single SurvivalExperiment ICGCtrainCohorts <- merge(ICGCtrainCohorts[[1]], ICGCtrainCohorts[[2]], cohortNames=names(ICGCtrainCohorts)) RLSmodel <- RLSModel(ICGCtrainCohorts, minDaysSurvived=365, randomSeed=1987) ## ----RLSModel_training-------------------------------------------------------- trainedRLSmodel <- trainModel(RLSmodel, numModels=15) ## ----RLSModel_predictions----------------------------------------------------- RLSpredCohortList <- predictClasses(validationCohortList, model=trainedRLSmodel) ## ----RLSModel_validation------------------------------------------------------ validatedRLSmodel <- validateModel(trainedRLSmodel, RLSpredCohortList) ## ----RLSModel_PCOSP_comparison_plot, fig.width=8, fig.height=8, fig.wide=TRUE---- RLSmodelComparisonPlot <- densityPlotModelComparison(validatedRLSmodel, validatedPCOSPmodel, title='Random Label Shuffling vs PCOSP', mDataTypeLabels=c(rna_seq='Sequencing-based', rna_micro='Array-based', combined='Overall')) RLSmodelComparisonPlot ## ----RGAModeL_constructor----------------------------------------------------- RGAmodel <- RGAModel(ICGCtrainCohorts, randomSeed=1987) ## ----RGAModel_training-------------------------------------------------------- trainedRGAmodel <- trainModel(RGAmodel, numModels=15) ## ----RGAModel_predictions----------------------------------------------------- RGApredCohortList <- predictClasses(validationCohortList, model=trainedRGAmodel) ## ----RGAModel_validation------------------------------------------------------ validatedRGAmodel <- validateModel(trainedRGAmodel, RGApredCohortList) ## ----RGAModel_PCOSP_comparison_plot, fig.width=8, fig.height=8, fig.wide=TRUE---- RGAmodelComparisonPlot <- densityPlotModelComparison(validatedRGAmodel, validatedPCOSPmodel, title='Random Gene Assignment vs PCOSP', mDataTypeLabels=c(rna_seq='Sequencing-based', rna_micro='Array-based', combined='Overall')) RGAmodelComparisonPlot ## ----PCOSP_get_top_features--------------------------------------------------- topFeatures <- getTopFeatures(validatedPCOSPmodel) topFeatures ## ----msigdbr_get_pathways, eval=FALSE----------------------------------------- # allHumanGeneSets <- msigdbr() # allGeneSets <- as.data.table(allHumanGeneSets) # geneSets <- allGeneSets[grepl('^GO.*|.*CANONICAL.*|^HALLMARK.*', gs_name), # .(gene_symbol, gs_name)] ## ----PCOSP_runGSEA, eval=FALSE------------------------------------------------ # GSEAresultDT <- runGSEA(validatedPCOSPmodel, geneSets) ## ----training_data_patient_metadata------------------------------------------- knitr::kable(head(colData(ICGCtrainCohorts))) ## ----ClinicalModel_constructor------------------------------------------------ clinicalModel <- ClinicalModel(ICGCtrainCohorts, formula='prognosis ~ sex + age + T + N + M + grade', randomSeed=1987) ## ----ClinicalModel_training--------------------------------------------------- trainedClinicalModel <- trainModel(clinicalModel) ## ----ClinicalModel_prediction------------------------------------------------- hasModelParamsCohortList <- PCOSPpredValCohorts[c('ICGCMICRO', 'TCGA', 'PCSI', 'OUH')] clinicalPredCohortList <- predictClasses(hasModelParamsCohortList, model=trainedClinicalModel) ## ----ClinicalModel_predictions------------------------------------------------ validatedClinicalModel <- validateModel(trainedClinicalModel, clinicalPredCohortList) ## ----ClinicalModel_vs_PCOSP_AUC_barplot, fig.wide=TRUE, fig.width=8, fig.height=8---- clinicalVsPCOSPbarPlot <- barPlotModelComparison(validatedClinicalModel, validatedPCOSPmodel, stat='AUC') clinicalVsPCOSPbarPlot ## ----ModelComparison_object--------------------------------------------------- clinicalVsPCOSP <- compareModels(validatedClinicalModel, validatedPCOSPmodel) ## ----ClinicalModel_vs_PCOSP_dIndex, fig.wide=TRUE, fig.height=8, fig.width=8---- clinVsPCOSPdIndexForestPlot <- forestPlot(clinicalVsPCOSP, stat='log_D_index') clinVsPCOSPdIndexForestPlot ## ----ClinicalModel_vs_PCOSP_concIndex, fig.wide=TRUE, fig.height=8, fig.width=8---- clinVsPCOSPconcIndexForestPlot <- forestPlot(clinicalVsPCOSP, stat='concordance_index') clinVsPCOSPconcIndexForestPlot ## ----GeneFuModel_constructor-------------------------------------------------- chenGeneFuModel <- GeneFuModel(randomSeed=1987) birnbaumGeneFuModel <- GeneFuModel(randomSeed=1987) haiderGeneFuModel <- GeneFuModel(randomSeed=1987) ## ----GeneFuModel_assign_models------------------------------------------------ data(existingClassifierData) models(chenGeneFuModel) <- SimpleList(list(chen=chen)) models(birnbaumGeneFuModel) <- SimpleList(list(birnbuam=birnbaum)) models(haiderGeneFuModel) <- SimpleList(list(haider=NA)) ## ----GeneFuModel_predictions-------------------------------------------------- chenClassPredictions <- predictClasses(PCOSPpredValCohorts[names(haiderSigScores)], model=chenGeneFuModel) birnClassPredictions <- predictClasses(PCOSPpredValCohorts[names(haiderSigScores)], model=birnbaumGeneFuModel) ## ----GeneFuModel_haider_fix--------------------------------------------------- haiderClassPredictions <- PCOSPpredValCohorts[names(haiderSigScores)] # Manually assign the scores to the prediction cohorts for (i in seq_along(haiderClassPredictions)) { colMData <- colData(haiderClassPredictions[[i]]) colMData$genefu_score <- NA_real_ colMData[rownames(colMData) %in% names(haiderSigScores[[i]]), ]$genefu_score <- haiderSigScores[[i]][names(haiderSigScores[[i]]) %in% rownames(colMData)] colData(haiderClassPredictions[[i]]) <- colMData } # Setup the correct model metadata mcols(haiderClassPredictions)$hasPredictions <- TRUE metadata(haiderClassPredictions)$predictionModel <- haiderGeneFuModel ## ----------------------------------------------------------------------------- validatedChenModel <- validateModel(chenGeneFuModel, valData=chenClassPredictions) validatedBirnbaumModel <- validateModel(birnbaumGeneFuModel, valData=birnClassPredictions) validatedHaiderModel <- validateModel(haiderGeneFuModel, valData=haiderClassPredictions) ## ----comparing_GeneFuModels--------------------------------------------------- genefuModelComparisons <- compareModels(validatedChenModel, validatedBirnbaumModel, modelNames=c('Chen', 'Birnbaum')) genefuModelComparisons <- compareModels(genefuModelComparisons, validatedHaiderModel, model2Name='Haider') ## ----comparing_comparisons---------------------------------------------------- allModelComparisons <- compareModels(genefuModelComparisons, validatedPCOSPmodel, model2Name='PCOSP') # We are only interested in comparing the summaries, so we subset our model comparison allModelComparisons <- subset(allModelComparisons, isSummary == TRUE) ## ----plotting Model_Comparisons_dindex, fig.width=8, fig.height=8, fig.wide=TRUE---- allDindexComparisonForestPlot <- forestPlot(allModelComparisons, stat='log_D_index', colourBy='model', groupBy='mDataType') allDindexComparisonForestPlot ## ----plotting Model Comparisons_concindex, fig.width=8, fig.height=8, fig.wide=TRUE---- allConcIndexComparisonForestPlot <- forestPlot(allModelComparisons, stat='concordance_index', colourBy='model', groupBy='mDataType') allConcIndexComparisonForestPlot ## ----adding_subtypes_to_CohortList-------------------------------------------- data(cohortSubtypeDFs) # Add the subtypes to the prediction cohort subtypedPCOSPValCohorts <- assignSubtypes(PCOSPpredValCohorts, cohortSubtypeDFs) ## ----validateModel_with_subtypes---------------------------------------------- subtypeValidatedPCOSPmodel <- validateModel(trainedPCOSPmodel, valData=subtypedPCOSPValCohorts) ## ----forestPlot_Dindex_subtyped_PCOSP_model, fig.width=8, fig.height=8, fig.wide=TRUE---- forestPlot(subtypeValidatedPCOSPmodel, stat='log_D_index', groupBy='cohort', colourBy='subtype') ## ----forestPlot_Cindex_subtyped_PCOSP_model, fig.width=8, fig.height=8, fig.wide=TRUE---- forestPlot(subtypeValidatedPCOSPmodel, stat='concordance_index', groupBy='cohort', colourBy='subtype')