## ----style, echo=FALSE, results='hide', message=FALSE------------------------- library(BiocStyle) library(knitr) opts_chunk$set(error = FALSE, message = FALSE, warning = FALSE) opts_chunk$set(fig.asp = 1) ## ----installation, echo=TRUE, eval=FALSE-------------------------------------- # ## try http:// if https:// URLs are not supported # if (!requireNamespace("BiocManager", quietly=TRUE)) # install.packages("BiocManager") # BiocManager::install("Melissa") # # ## Or download from Github repository # # install.packages("devtools") # devtools::install_github("andreaskapou/Melissa", build_vignettes = TRUE) ## ----melissa, fig.retina = NULL, fig.align='center', fig.cap="`Melissa` model overview. Melissa combines a likelihood computed from single cell methylation profiles fitted to each genomic region using a supervised regression approach (bottom left) and an unsupervised Bayesian clustering prior (top left). The posterior distribution provides a methylome-based clustering (top right) and imputation (bottom right) of single cells.", echo=FALSE---- knitr::include_graphics("../inst/figures/melissa.png") ## ----load_synth_data---------------------------------------------------------- suppressPackageStartupMessages(library(Melissa)) # Load package dt_obj <- melissa_encode_dt # Load synthetic data ## ---- eval=FALSE, echo=FALSE, include=FALSE----------------------------------- # # For efficiency keep only the first 50 genomic regions # dt_obj$met <- dt_obj$met[1:50] # dt_obj$opts$C_true <- dt_obj$opts$C_true[1:50,] # #dt_obj$met <- lapply(dt_obj$met, function(x) x[1:50]) ## ----------------------------------------------------------------------------- # Elements of `dt_obj` object names(dt_obj) ## ----------------------------------------------------------------------------- head(dt_obj$met[[2]][[50]]) ## ----------------------------------------------------------------------------- # Number of cells cat("Number of cells: ", length(dt_obj$met)) ## ----------------------------------------------------------------------------- # Number of genomic regions in each cell cat("Number of genomic regions: ", length(dt_obj$met[[1]]) ) ## ----create_basis------------------------------------------------------------- library(BPRMeth) # Create RBF basis object with 4 RBFs basis_obj <- create_rbf_object(M = 4) ## ----show_basis--------------------------------------------------------------- # Show the slots of the 'rbf' object basis_obj ## ----partition_data----------------------------------------------------------- set.seed(15) # Partition to training and test set dt_obj <- partition_dataset(dt_obj = dt_obj, data_train_prcg = 0.2, region_train_prcg = 1, cpg_train_prcg = 0.4, is_synth = TRUE) ## ----run_melissa-------------------------------------------------------------- set.seed(15) # Run Melissa with K = 4 clusters melissa_obj <- melissa(X = dt_obj$met, K = 4, basis = basis_obj, vb_max_iter = 20, vb_init_nstart = 1, is_parallel = FALSE) ## ----summary_mixing_proportions----------------------------------------------- melissa_obj$pi_k ## ----summary_responsibilities------------------------------------------------- head(melissa_obj$r_nk) ## ----summary_weights---------------------------------------------------------- melissa_obj$W[10, , 2] ## ----plot_profiles_1, fig.wide=TRUE------------------------------------------- # Plot profiles from all cell subtypes for genomic region 25 plot_melissa_profiles(melissa_obj = melissa_obj, region = 25, title = "Methylation profiles for region 25") ## ----plot_profiles_2, fig.wide=TRUE------------------------------------------- # Plot profiles from all cell subtypes for genomic region 90 plot_melissa_profiles(melissa_obj = melissa_obj, region = 90, title = "Methylation profiles for region 90") ## ----evaluate_cluster_perf---------------------------------------------------- # Run clustering performance melissa_obj <- eval_cluster_performance(melissa_obj, dt_obj$opts$C_true) ## ----ari_measure-------------------------------------------------------------- # ARI metric cat("ARI: ", melissa_obj$clustering$ari) ## ----cluster_assignment_error------------------------------------------------- # Clustering assignment error metric cat("Clustering assignment error: ", melissa_obj$clustering$error) ## ----perfrom_imputation------------------------------------------------------- imputation_obj <- impute_test_met(obj = melissa_obj, test = dt_obj$met_test) ## ----evaluate_imputation------------------------------------------------------ melissa_obj <- eval_imputation_performance(obj = melissa_obj, imputation_obj=imputation_obj) ## ----auc---------------------------------------------------------------------- # AUC cat("AUC: ", melissa_obj$imputation$auc) ## ----f_measure---------------------------------------------------------------- # F-measure cat("F-measure: ", melissa_obj$imputation$f_measure) ## ----real_data, eval=FALSE---------------------------------------------------- # # Observed binarised met file format: # # chr pos met_state # # 1 11 0 # # X 15 1 # # # To impute met file format: # # chr pos met_state # # 1 20 -1 # # X 25 -1 # # X 44 0 # # # Imputed met file format: # # chr pos met_state predicted # # 1 20 -1 0.74 # # X 25 -1 0.1 # # X 44 0 0.05 # # # Directory of files (cells) with binarised methylation data. Each row # # contains CpGs that we have coverage and will be used to infer # # methylation profiles with Melissa. # met_dir <- "directory_of_observed_met_data" # # # Directory of files (cells, filenames and their structure should # # match with those in met_dir) to make predictions. Each row corresponds # # to a CpG that we will impute its value. It can contain both CpGs # # with and without CpG coverage. Those CpGs without coverage the third # # column can be any value, e.g. -1. Melissa will create a new folder # # with the same filenames, but including a new column named , # # corresponding the imputed value. # impute_met_dir <- "directory_of_cells_to_impute_CpGs" # # # Annotation file name for creating genomic regions of interest # anno_file <- "name_of_anno_file" # # # Create melissa data object # melissa_data <- create_melissa_data_obj(met_dir, anno_file) # # # QC and filtering regions # # By CpG coverage # melissa_data <- filter_by_cpg_coverage(melissa_data, min_cpgcov = 5) # # By methylation variability # melissa_data <- filter_by_variability(melissa_data, min_var = 0.1) # # By genomic coverage across cells # melissa_data <- filter_by_coverage_across_cells(melissa_data, # min_cell_cov_prcg = 0.3) # # # Run Melissa # melissa_obj <- melissa(X = melissa_data$met, K = 2) # # # Extract annotation region object # anno_region <- melissa_data$anno_region # # # Peform imputation. This function will create a new folder with # # imputed met values for each cell. Note that the new files will only # # contain imputed values for CpGs that fall in `anno_region`, since # # Melissa can predict values only within these regions. # out <- impute_met_files(met_dir = impute_met_dir, obj = melissa_obj, # anno_region = anno_region) ## ----session_info, echo=TRUE, message=FALSE----------------------------------- sessionInfo()