## ----style, echo=FALSE, results='hide', message=FALSE-------------------------
library(BiocStyle)
library(knitr)
opts_chunk$set(error = FALSE, message = FALSE, warning = FALSE)
opts_chunk$set(fig.asp = 1)

## ----installation, echo=TRUE, eval=FALSE--------------------------------------
#  ## try http:// if https:// URLs are not supported
#  if (!requireNamespace("BiocManager", quietly=TRUE))
#      install.packages("BiocManager")
#  BiocManager::install("Melissa")
#  
#  ## Or download from Github repository
#  # install.packages("devtools")
#  devtools::install_github("andreaskapou/Melissa", build_vignettes = TRUE)

## ----melissa, fig.retina = NULL, fig.align='center', fig.cap="`Melissa` model overview. Melissa combines a likelihood computed from single cell methylation profiles fitted to each genomic region using a supervised regression approach (bottom left) and an unsupervised Bayesian clustering prior (top left). The posterior distribution provides a methylome-based clustering (top right) and imputation (bottom right) of single cells.", echo=FALSE----
knitr::include_graphics("../inst/figures/melissa.png")

## ----load_synth_data----------------------------------------------------------
suppressPackageStartupMessages(library(Melissa)) # Load package
dt_obj <- melissa_encode_dt   # Load synthetic data

## ---- eval=FALSE, echo=FALSE, include=FALSE-----------------------------------
#  # For efficiency keep only the first 50 genomic regions
#  dt_obj$met <- dt_obj$met[1:50]
#  dt_obj$opts$C_true <- dt_obj$opts$C_true[1:50,]
#  #dt_obj$met <- lapply(dt_obj$met, function(x) x[1:50])

## -----------------------------------------------------------------------------
# Elements of `dt_obj` object
names(dt_obj)

## -----------------------------------------------------------------------------
head(dt_obj$met[[2]][[50]])

## -----------------------------------------------------------------------------
# Number of cells
cat("Number of cells: ", length(dt_obj$met))

## -----------------------------------------------------------------------------
# Number of genomic regions in each cell
cat("Number of genomic regions: ", length(dt_obj$met[[1]]) )

## ----create_basis-------------------------------------------------------------
library(BPRMeth)
# Create RBF basis object with 4 RBFs
basis_obj <- create_rbf_object(M = 4)

## ----show_basis---------------------------------------------------------------
# Show the slots of the 'rbf' object
basis_obj

## ----partition_data-----------------------------------------------------------
set.seed(15)
# Partition to training and test set
dt_obj <- partition_dataset(dt_obj = dt_obj, data_train_prcg = 0.2,
                            region_train_prcg = 1, cpg_train_prcg = 0.4, 
                            is_synth = TRUE)

## ----run_melissa--------------------------------------------------------------
set.seed(15)
# Run Melissa with K = 4 clusters
melissa_obj <- melissa(X = dt_obj$met, K = 4, basis = basis_obj,
                       vb_max_iter = 20, vb_init_nstart = 1, 
                       is_parallel = FALSE)

## ----summary_mixing_proportions-----------------------------------------------
melissa_obj$pi_k

## ----summary_responsibilities-------------------------------------------------
head(melissa_obj$r_nk)

## ----summary_weights----------------------------------------------------------
melissa_obj$W[10, , 2]

## ----plot_profiles_1, fig.wide=TRUE-------------------------------------------
# Plot profiles from all cell subtypes for genomic region 25
plot_melissa_profiles(melissa_obj = melissa_obj, region = 25, 
                      title = "Methylation profiles for region 25")

## ----plot_profiles_2, fig.wide=TRUE-------------------------------------------
# Plot profiles from all cell subtypes for genomic region 90
plot_melissa_profiles(melissa_obj = melissa_obj, region = 90, 
                      title = "Methylation profiles for region 90")

## ----evaluate_cluster_perf----------------------------------------------------
# Run clustering performance
melissa_obj <- eval_cluster_performance(melissa_obj, dt_obj$opts$C_true)

## ----ari_measure--------------------------------------------------------------
# ARI metric
cat("ARI: ", melissa_obj$clustering$ari)

## ----cluster_assignment_error-------------------------------------------------
# Clustering assignment error metric
cat("Clustering assignment error: ", melissa_obj$clustering$error)

## ----perfrom_imputation-------------------------------------------------------
imputation_obj <- impute_test_met(obj = melissa_obj, 
                                  test = dt_obj$met_test)

## ----evaluate_imputation------------------------------------------------------
melissa_obj <- eval_imputation_performance(obj = melissa_obj, 
                                           imputation_obj=imputation_obj)

## ----auc----------------------------------------------------------------------
# AUC 
cat("AUC: ", melissa_obj$imputation$auc)

## ----f_measure----------------------------------------------------------------
# F-measure
cat("F-measure: ", melissa_obj$imputation$f_measure)

## ----real_data, eval=FALSE----------------------------------------------------
#  # Observed binarised met file format:
#  # chr pos met_state
#  # 1   11  0
#  # X   15  1
#  
#  # To impute met file format:
#  # chr pos met_state
#  # 1   20  -1
#  # X   25  -1
#  # X   44  0
#  
#  # Imputed met file format:
#  # chr pos met_state predicted
#  # 1   20    -1      0.74
#  # X   25    -1      0.1
#  # X   44    0       0.05
#  
#  # Directory of files (cells) with binarised methylation data. Each row
#  # contains CpGs that we have coverage and will be used to infer
#  # methylation profiles with Melissa.
#  met_dir <- "directory_of_observed_met_data"
#  
#  # Directory of files (cells, filenames and their structure should
#  # match with those in met_dir) to make predictions. Each row corresponds
#  # to a CpG that we will impute its value. It can contain both CpGs
#  # with and without CpG coverage. Those CpGs without coverage the third
#  # column can be any value, e.g. -1. Melissa will create a new folder
#  # with the same filenames, but including a new column named <predicted>,
#  # corresponding the imputed value.
#  impute_met_dir <- "directory_of_cells_to_impute_CpGs"
#  
#  # Annotation file name for creating genomic regions of interest
#  anno_file <- "name_of_anno_file"
#  
#  # Create melissa data object
#  melissa_data <- create_melissa_data_obj(met_dir, anno_file)
#  
#  # QC and filtering regions
#  # By CpG coverage
#  melissa_data <- filter_by_cpg_coverage(melissa_data, min_cpgcov = 5)
#  # By methylation variability
#  melissa_data <- filter_by_variability(melissa_data, min_var = 0.1)
#  # By genomic coverage across cells
#  melissa_data <- filter_by_coverage_across_cells(melissa_data,
#                                                  min_cell_cov_prcg = 0.3)
#  
#  # Run Melissa
#  melissa_obj <- melissa(X = melissa_data$met, K = 2)
#  
#  # Extract annotation region object
#  anno_region <- melissa_data$anno_region
#  
#  # Peform imputation. This function will create a new folder with
#  # imputed met values for each cell. Note that the new files will only
#  # contain imputed values for CpGs that fall in `anno_region`, since
#  # Melissa can predict values only within these regions.
#  out <- impute_met_files(met_dir = impute_met_dir, obj = melissa_obj,
#                          anno_region = anno_region)

## ----session_info, echo=TRUE, message=FALSE-----------------------------------
sessionInfo()