Predicting MCIA global (factor) scores for new test samples

It may be of interest to use the embedding that is calculated on a training sample set to predict scores on a test set (or, equivalently, on new data).

After loading the nipalsMCIA library, we randomly split the NCI60 cancer cell line data into training and test sets.

Installation

# install.packages("devtools")
devtools::install_github("Muunraker/nipalsMCIA", ref = "code-development",
                         force = TRUE, build_vignettes = TRUE) # devel version
# after acceptance
# install.packages("BiocManager")
BiocManager::install("nipalsMCIA")
library(ggplot2)
library(MultiAssayExperiment)
library(nipalsMCIA)

Split the data

data(NCI60)
set.seed(8)

num_samples <- dim(data_blocks[[1]])[1]
num_train <- round(num_samples * 0.7, 0)
train_samples <- sample.int(num_samples, num_train)

data_blocks_train <- data_blocks
data_blocks_test <- data_blocks

for (i in seq_along(data_blocks)) {
  data_blocks_train[[i]] <- data_blocks_train[[i]][train_samples, ]
  data_blocks_test[[i]] <- data_blocks_test[[i]][-train_samples, ]
}

# Split corresponding metadata
metadata_train <- data.frame(metadata_NCI60[train_samples, ],
                             row.names = rownames(data_blocks_train$mrna))
colnames(metadata_train) <- c("cancerType")

metadata_test <- data.frame(metadata_NCI60[-train_samples, ],
                            row.names = rownames(data_blocks_test$mrna))
colnames(metadata_test) <- c("cancerType")

# Create train and test mae objects
data_blocks_train_mae <- simple_mae(data_blocks_train, row_format = "sample",
                                    colData = metadata_train)
data_blocks_test_mae <- simple_mae(data_blocks_test, row_format = "sample",
                                   colData = metadata_test)

Run nipalsMCIA on training data

MCIA_train <- nipals_multiblock(data_blocks = data_blocks_train_mae,
                                col_preproc_method = "colprofile", num_PCs = 10,
                                plots = "none", tol = 1e-9)

Visualize model on training data using metadata on cancer type

The get_metadata_colors() function returns an assignment of a color for the metadata columns. The nmb_get_gs() function returns the global scores from the input NipalsResult object.

meta_colors <- get_metadata_colors(mcia_results = MCIA_train, color_col = 1,
                                   color_pal_params = list(option = "E"))

global_scores <- nmb_get_gs(MCIA_train)
MCIA_out <- data.frame(global_scores[, 1:2])
MCIA_out$cancerType <- nmb_get_metadata(MCIA_train)$cancerType
colnames(MCIA_out) <- c("Factor.1", "Factor.2", "cancerType")

# plot the results
ggplot(data = MCIA_out, aes(x = Factor.1, y = Factor.2, color = cancerType)) +
  geom_point(size = 3) +
  labs(title = "MCIA for NCI60 training data") +
  scale_color_manual(values = meta_colors) +
  theme_bw()

Generate factor scores for test data using the MCIA_train model

We use the function to generate new factor scores on the test data set using the MCIA_train model. The new dataset in the form of an MAE object is input using the parameter test_data.

MCIA_test_scores <- predict_gs(mcia_results = MCIA_train,
                               test_data = data_blocks_test_mae)

Visualize new scores with old

We once again plot the top two factor scores for both the training and test datasets

MCIA_out_test <- data.frame(MCIA_test_scores[, 1:2])
MCIA_out_test$cancerType <- MultiAssayExperiment::colData(data_blocks_test_mae)$cancerType

colnames(MCIA_out_test) <- c("Factor.1", "Factor.2", "cancerType")
MCIA_out_test$set <- "test"
MCIA_out$set <- "train"
MCIA_out_full <- rbind(MCIA_out, MCIA_out_test)
rownames(MCIA_out_full) <- NULL

# plot the results
ggplot(data = MCIA_out_full,
       aes(x = Factor.1, y = Factor.2, color = cancerType, shape = set)) +
  geom_point(size = 3) +
  labs(title = "MCIA for NCI60 training and test data") +
  scale_color_manual(values = meta_colors) +
  theme_bw()

Session Info

Session Info

sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
## 
## Matrix products: default
## BLAS:   /home/biocbuild/bbs-3.18-bioc/R/lib/libRblas.so 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB              LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/New_York
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    grid      stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] MultiAssayExperiment_1.28.0 SummarizedExperiment_1.32.0
##  [3] Biobase_2.62.0              GenomicRanges_1.54.0       
##  [5] GenomeInfoDb_1.38.0         BiocGenerics_0.48.0        
##  [7] MatrixGenerics_1.14.0       matrixStats_1.0.0          
##  [9] SeuratObject_4.1.4          Seurat_4.4.0               
## [11] piggyback_0.1.5             IRanges_2.36.0             
## [13] S4Vectors_0.40.0            stringr_1.5.0              
## [15] nipalsMCIA_1.0.0            ggpubr_0.6.0               
## [17] ggplot2_3.4.4               fgsea_1.28.0               
## [19] dplyr_1.1.3                 ComplexHeatmap_2.18.0      
## [21] BiocStyle_2.30.0           
## 
## loaded via a namespace (and not attached):
##   [1] RcppAnnoy_0.0.21        splines_4.3.1           later_1.3.1            
##   [4] bitops_1.0-7            tibble_3.2.1            polyclip_1.10-6        
##   [7] httr2_0.2.3             lifecycle_1.0.3         rstatix_0.7.2          
##  [10] doParallel_1.0.17       globals_0.16.2          lattice_0.22-5         
##  [13] MASS_7.3-60             backports_1.4.1         magrittr_2.0.3         
##  [16] plotly_4.10.3           sass_0.4.7              rmarkdown_2.25         
##  [19] jquerylib_0.1.4         yaml_2.3.7              httpuv_1.6.12          
##  [22] sctransform_0.4.1       sp_2.1-1                spatstat.sparse_3.0-3  
##  [25] reticulate_1.34.0       cowplot_1.1.1           pbapply_1.7-2          
##  [28] RColorBrewer_1.1-3      lubridate_1.9.3         abind_1.4-5            
##  [31] zlibbioc_1.48.0         Rtsne_0.16              purrr_1.0.2            
##  [34] RCurl_1.98-1.12         pracma_2.4.2            rappdirs_0.3.3         
##  [37] circlize_0.4.15         GenomeInfoDbData_1.2.11 ggrepel_0.9.4          
##  [40] irlba_2.3.5.1           gitcreds_0.1.2          listenv_0.9.0          
##  [43] spatstat.utils_3.0-4    goftest_1.2-3           RSpectra_0.16-1        
##  [46] spatstat.random_3.2-1   fitdistrplus_1.1-11     parallelly_1.36.0      
##  [49] leiden_0.4.3            codetools_0.2-19        DelayedArray_0.28.0    
##  [52] tidyselect_1.2.0        shape_1.4.6             farver_2.1.1           
##  [55] spatstat.explore_3.2-5  jsonlite_1.8.7          GetoptLong_1.0.5       
##  [58] ellipsis_0.3.2          progressr_0.14.0        ggridges_0.5.4         
##  [61] survival_3.5-7          iterators_1.0.14        foreach_1.5.2          
##  [64] tools_4.3.1             ica_1.0-3               Rcpp_1.0.11            
##  [67] glue_1.6.2              gridExtra_2.3           SparseArray_1.2.0      
##  [70] BiocBaseUtils_1.4.0     xfun_0.40               withr_2.5.1            
##  [73] BiocManager_1.30.22     fastmap_1.1.1           fansi_1.0.5            
##  [76] digest_0.6.33           timechange_0.2.0        R6_2.5.1               
##  [79] mime_0.12               colorspace_2.1-0        scattermore_1.2        
##  [82] Cairo_1.6-1             tensor_1.5              spatstat.data_3.0-3    
##  [85] utf8_1.2.4              tidyr_1.3.0             generics_0.1.3         
##  [88] data.table_1.14.8       httr_1.4.7              htmlwidgets_1.6.2      
##  [91] S4Arrays_1.2.0          uwot_0.1.16             pkgconfig_2.0.3        
##  [94] gtable_0.3.4            lmtest_0.9-40           XVector_0.42.0         
##  [97] htmltools_0.5.6.1       carData_3.0-5           bookdown_0.36          
## [100] clue_0.3-65             scales_1.2.1            png_0.1-8              
## [103] knitr_1.44              reshape2_1.4.4          rjson_0.2.21           
## [106] curl_5.1.0              nlme_3.1-163            cachem_1.0.8           
## [109] zoo_1.8-12              GlobalOptions_0.1.2     KernSmooth_2.23-22     
## [112] vipor_0.4.5             parallel_4.3.1          miniUI_0.1.1.1         
## [115] ggrastr_1.0.2           pillar_1.9.0            vctrs_0.6.4            
## [118] RANN_2.6.1              promises_1.2.1          car_3.1-2              
## [121] xtable_1.8-4            cluster_2.1.4           beeswarm_0.4.0         
## [124] evaluate_0.22           magick_2.8.1            cli_3.6.1              
## [127] compiler_4.3.1          rlang_1.1.1             crayon_1.5.2           
## [130] future.apply_1.11.0     ggsignif_0.6.4          labeling_0.4.3         
## [133] ggbeeswarm_0.7.2        fs_1.6.3                plyr_1.8.9             
## [136] stringi_1.7.12          deldir_1.0-9            viridisLite_0.4.2      
## [139] BiocParallel_1.36.0     munsell_0.5.0           gh_1.4.0               
## [142] lazyeval_0.2.2          spatstat.geom_3.2-7     Matrix_1.6-1.1         
## [145] patchwork_1.1.3         future_1.33.0           shiny_1.7.5.1          
## [148] ROCR_1.0-11             igraph_1.5.1            broom_1.0.5            
## [151] memoise_2.0.1           bslib_0.5.1             fastmatch_1.1-4