## ----setup, eval = TRUE, echo = FALSE-----------------------------------------
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>"
)

## ----installation, eval = FALSE-----------------------------------------------
#  # To install scGPS from github (Depending on the configuration of the local
#  # computer or HPC, possible custom C++ compilation may be required - see
#  # installation trouble-shootings below)
#  devtools::install_github("IMB-Computational-Genomics-Lab/scGPS")
#  
#  # for C++ compilation trouble-shooting, manual download and installation can be
#  # done from github
#  
#  git clone https://github.com/IMB-Computational-Genomics-Lab/scGPS
#  
#  # then check in scGPS/src if any of the precompiled (e.g.  those with *.so and
#  # *.o) files exist and delete them before recompiling
#  
#  # then with the scGPS as the R working directory, manually install and load
#  # using devtools functionality
#  # Install the package
#  devtools::install()
#  #load the package to the workspace
#  library(scGPS)
#  

## ----scGPS_object, warning = FALSE, message = FALSE---------------------------

# load mixed population 1 (loaded from day_2_cardio_cell_sample dataset, 
# named it as day2)
library(scGPS)

day2 <- day_2_cardio_cell_sample
mixedpop1 <- new_scGPS_object(ExpressionMatrix = day2$dat2_counts,
    GeneMetadata = day2$dat2geneInfo, CellMetadata = day2$dat2_clusters)

# load mixed population 2 (loaded from day_5_cardio_cell_sample dataset, 
# named it as day5)
day5 <- day_5_cardio_cell_sample
mixedpop2 <- new_scGPS_object(ExpressionMatrix = day5$dat5_counts,
    GeneMetadata = day5$dat5geneInfo, CellMetadata = day5$dat5_clusters)


## ----prediction, warning = FALSE, message = FALSE-----------------------------

# select a subpopulation
c_selectID <- 1
# load gene list (this can be any lists of user selected genes)
genes <- training_gene_sample
genes <- genes$Merged_unique
# load cluster information 
cluster_mixedpop1 <- colData(mixedpop1)[,1]
cluster_mixedpop2 <- colData(mixedpop2)[,1]
#run training (running nboots = 3 here, but recommend to use nboots = 50-100)
LSOLDA_dat <- bootstrap_prediction(nboots = 3, mixedpop1 = mixedpop1, 
    mixedpop2 = mixedpop2, genes = genes, c_selectID  = c_selectID,
    listData = list(), cluster_mixedpop1 = cluster_mixedpop1, 
    cluster_mixedpop2 = cluster_mixedpop2, trainset_ratio = 0.7)
names(LSOLDA_dat)

## ----summarise_results, warning = FALSE, message = FALSE----------------------
# summary results LDA
sum_pred_lda <- summary_prediction_lda(LSOLDA_dat = LSOLDA_dat, nPredSubpop = 4)
# summary results Lasso to show the percent of cells
# classified as cells belonging 
sum_pred_lasso <- summary_prediction_lasso(LSOLDA_dat = LSOLDA_dat,
    nPredSubpop = 4)
# plot summary results 
plot_sum <-function(sum_dat){
    sum_dat_tf <- t(sum_dat)
    sum_dat_tf <- na.omit(sum_dat_tf)
    sum_dat_tf <- apply(sum_dat[, -ncol(sum_dat)],1,
        function(x){as.numeric(as.vector(x))})
    sum_dat$names <- gsub("ElasticNet for subpop","sp",  sum_dat$names )
    sum_dat$names <- gsub("in target mixedpop","in p",  sum_dat$names) 
    sum_dat$names <- gsub("LDA for subpop","sp",  sum_dat$names )
    sum_dat$names <- gsub("in target mixedpop","in p",  sum_dat$names)
    colnames(sum_dat_tf) <- sum_dat$names
    boxplot(sum_dat_tf, las=2)
}
plot_sum(sum_pred_lasso)
plot_sum(sum_pred_lda)
# summary accuracy to check the model accuracy in the leave-out test set 
summary_accuracy(object = LSOLDA_dat)
# summary maximum deviance explained by the model 
summary_deviance(object = LSOLDA_dat)

## ----CORE, warning = FALSE, message = FALSE-----------------------------------

# find clustering information in an expresion data using CORE
day5 <- day_5_cardio_cell_sample
cellnames <- colnames(day5$dat5_counts)
cluster <-day5$dat5_clusters
cellnames <-data.frame("Cluster"=cluster, "cellBarcodes" = cellnames)
mixedpop2 <-new_scGPS_object(ExpressionMatrix = day5$dat5_counts,
                    GeneMetadata = day5$dat5geneInfo, CellMetadata = cellnames)

CORE_cluster <- CORE_clustering(mixedpop2, remove_outlier = c(0), PCA=FALSE)

# to update the clustering information, users can ...
key_height <- CORE_cluster$optimalClust$KeyStats$Height
optimal_res <- CORE_cluster$optimalClust$OptimalRes
optimal_index = which(key_height == optimal_res)

clustering_after_outlier_removal <- unname(unlist(
 CORE_cluster$Cluster[[optimal_index]]))
corresponding_cells_after_outlier_removal <- CORE_cluster$cellsForClustering
original_cells_before_removal <- colData(mixedpop2)[,2]
corresponding_index <- match(corresponding_cells_after_outlier_removal,
                            original_cells_before_removal )
# check the matching
identical(as.character(original_cells_before_removal[corresponding_index]),
         corresponding_cells_after_outlier_removal)
# create new object with the new clustering after removing outliers
mixedpop2_post_clustering <- mixedpop2[,corresponding_index]
colData(mixedpop2_post_clustering)[,1] <- clustering_after_outlier_removal


## ----SCORE with bagging, warning = FALSE, message = FALSE---------------------

# find clustering information in an expresion data using SCORE
day5 <- day_5_cardio_cell_sample
cellnames <- colnames(day5$dat5_counts)
cluster <-day5$dat5_clusters
cellnames <-data.frame("Cluster"=cluster, "cellBarcodes" = cellnames)
mixedpop2 <-new_scGPS_object(ExpressionMatrix = day5$dat5_counts,
                    GeneMetadata = day5$dat5geneInfo, CellMetadata = cellnames )

SCORE_test <- CORE_bagging(mixedpop2, remove_outlier = c(0), PCA=FALSE,
                                bagging_run = 20, subsample_proportion = .8)


## ----visualisation, dev='CairoPDF'--------------------------------------------
dev.off()
##3.2.1 plot CORE clustering
p1 <- plot_CORE(CORE_cluster$tree, CORE_cluster$Cluster, 
    color_branch = c("#208eb7", "#6ce9d3", "#1c5e39", "#8fca40", "#154975",
        "#b1c8eb"))
p1
#extract optimal index identified by CORE
key_height <- CORE_cluster$optimalClust$KeyStats$Height
optimal_res <- CORE_cluster$optimalClust$OptimalRes
optimal_index = which(key_height == optimal_res)
#plot one optimal clustering bar
plot_optimal_CORE(original_tree= CORE_cluster$tree,
                 optimal_cluster = unlist(CORE_cluster$Cluster[optimal_index]),
                 shift = -2000)

##3.2.2 plot SCORE clustering
#plot all clustering bars
plot_CORE(SCORE_test$tree, list_clusters = SCORE_test$Cluster)
#plot one stable optimal clustering bar
plot_optimal_CORE(original_tree= SCORE_test$tree,
                 optimal_cluster = unlist(SCORE_test$Cluster[
                    SCORE_test$optimal_index]),
                 shift = -100)


## ----compare_clustering, fig.width=5, fig.height=5----------------------------
t <- tSNE(expression.mat=assay(mixedpop2))
p2 <-plot_reduced(t, color_fac = factor(colData(mixedpop2)[,1]),
                      palletes =1:length(unique(colData(mixedpop2)[,1])))
p2

## ----find_markers, warning = FALSE, message = FALSE, fig.width=7--------------
#load gene list (this can be any lists of user-selected genes)
genes <-training_gene_sample
genes <-genes$Merged_unique

#the gene list can also be objectively identified by differential expression
#analysis cluster information is requied for find_markers. Here, we use
#CORE results.

#colData(mixedpop2)[,1] <- unlist(SCORE_test$Cluster[SCORE_test$optimal_index])

suppressMessages(library(locfit))

DEgenes <- find_markers(expression_matrix=assay(mixedpop2),
                            cluster = colData(mixedpop2)[,1],
                            selected_cluster=unique(colData(mixedpop2)[,1]))

#the output contains dataframes for each cluster.
#the data frame contains all genes, sorted by p-values
names(DEgenes)

#you can annotate the identified clusters
DEgeneList_1vsOthers <- DEgenes$DE_Subpop1vsRemaining$id

#users need to check the format of the gene input to make sure they are
#consistent to the gene names in the expression matrix

#the following command saves the file "PathwayEnrichment.xlsx" to the
#working dir
#use 500 top DE genes
suppressMessages(library(DOSE))
suppressMessages(library(ReactomePA))
suppressMessages(library(clusterProfiler))
genes500 <- as.factor(DEgeneList_1vsOthers[seq_len(500)])
enrichment_test <- annotate_clusters(genes, pvalueCutoff=0.05, gene_symbol=TRUE)

#the enrichment outputs can be displayed by running
clusterProfiler::dotplot(enrichment_test, showCategory=10, font.size = 6)


## ----scGPS_prediction, warning = FALSE, message = FALSE-----------------------

#select a subpopulation, and input gene list
c_selectID <- 1
#note make sure the format for genes input here is the same to the format
#for genes in the mixedpop1 and mixedpop2
genes = DEgenes$id[1:500]

#run the test bootstrap with nboots = 2 runs

cluster_mixedpop1 <- colData(mixedpop1)[,1]
cluster_mixedpop2 <- colData(mixedpop2)[,1]

LSOLDA_dat <- bootstrap_prediction(nboots = 2, mixedpop1 = mixedpop1,
                             mixedpop2 = mixedpop2, genes = genes, 
                             c_selectID  = c_selectID,
                             listData = list(),
                             cluster_mixedpop1 = cluster_mixedpop1,
                             cluster_mixedpop2 = cluster_mixedpop2)


## ----summarise_prediction-----------------------------------------------------
#get the number of rows for the summary matrix
row_cluster <-length(unique(colData(mixedpop2)[,1]))

#summary results LDA to to show the percent of cells classified as cells
#belonging by LDA classifier
summary_prediction_lda(LSOLDA_dat=LSOLDA_dat, nPredSubpop = row_cluster )

#summary results Lasso to show the percent of cells classified as cells
#belonging by Lasso classifier
summary_prediction_lasso(LSOLDA_dat=LSOLDA_dat, nPredSubpop = row_cluster)

# summary maximum deviance explained by the model during the model training
summary_deviance(object = LSOLDA_dat)

# summary accuracy to check the model accuracy in the leave-out test set
summary_accuracy(object = LSOLDA_dat)


## ----prediction_one_sample, warning = FALSE, message = FALSE, fig.width=5-----
#run prediction for 3 clusters
cluster_mixedpop1 <- colData(mixedpop1)[,1]
cluster_mixedpop2 <- colData(mixedpop2)[,1]
#cluster_mixedpop2 <- as.numeric(as.vector(colData(mixedpop2)[,1]))

c_selectID <- 1
#top 200 gene markers distinguishing cluster 1
genes = DEgenes$id[1:200]

LSOLDA_dat1 <- bootstrap_prediction(nboots = 2, mixedpop1 = mixedpop2,
                        mixedpop2 = mixedpop2, genes=genes, c_selectID, 
                        listData =list(),
                        cluster_mixedpop1 = cluster_mixedpop2,
                        cluster_mixedpop2 = cluster_mixedpop2)

c_selectID <- 2
genes = DEgenes$id[1:200]

LSOLDA_dat2 <- bootstrap_prediction(nboots = 2,mixedpop1 = mixedpop2,
                        mixedpop2 = mixedpop2, genes=genes, c_selectID, 
                        listData =list(),
                        cluster_mixedpop1 = cluster_mixedpop2,
                        cluster_mixedpop2 = cluster_mixedpop2)

c_selectID <- 3
genes = DEgenes$id[1:200]
LSOLDA_dat3 <- bootstrap_prediction(nboots = 2,mixedpop1 = mixedpop2,
                        mixedpop2 = mixedpop2, genes=genes, c_selectID, 
                        listData =list(),
                        cluster_mixedpop1 = cluster_mixedpop2,
                        cluster_mixedpop2 = cluster_mixedpop2)

c_selectID <- 4
genes = DEgenes$id[1:200]
LSOLDA_dat4 <- bootstrap_prediction(nboots = 2,mixedpop1 = mixedpop2,
                        mixedpop2 = mixedpop2, genes=genes, c_selectID, 
                        listData =list(),
                        cluster_mixedpop1 = cluster_mixedpop2,
                        cluster_mixedpop2 = cluster_mixedpop2)


#prepare table input for sankey plot

LASSO_C1S2  <- reformat_LASSO(c_selectID=1, mp_selectID = 2,
                             LSOLDA_dat=LSOLDA_dat1,
                             nPredSubpop = length(unique(colData(mixedpop2)
                                [,1])),
                             Nodes_group ="#7570b3")

LASSO_C2S2  <- reformat_LASSO(c_selectID=2, mp_selectID =2,
                             LSOLDA_dat=LSOLDA_dat2,
                             nPredSubpop = length(unique(colData(mixedpop2)
                                [,1])),
                             Nodes_group ="#1b9e77")

LASSO_C3S2  <- reformat_LASSO(c_selectID=3, mp_selectID =2,
                             LSOLDA_dat=LSOLDA_dat3,
                             nPredSubpop = length(unique(colData(mixedpop2)
                                [,1])),
                             Nodes_group ="#e7298a")

LASSO_C4S2  <- reformat_LASSO(c_selectID=4, mp_selectID =2,
                             LSOLDA_dat=LSOLDA_dat4,
                             nPredSubpop = length(unique(colData(mixedpop2)
                                [,1])),
                             Nodes_group ="#00FFFF")

combined <- rbind(LASSO_C1S2,LASSO_C2S2,LASSO_C3S2, LASSO_C4S2 )
combined <- combined[is.na(combined$Value) != TRUE,]

nboots = 2
#links: source, target, value
#source: node, nodegroup
combined_D3obj <-list(Nodes=combined[,(nboots+3):(nboots+4)],
                     Links=combined[,c((nboots+2):(nboots+1),ncol(combined))])

library(networkD3)

Node_source <- as.vector(sort(unique(combined_D3obj$Links$Source)))
Node_target <- as.vector(sort(unique(combined_D3obj$Links$Target)))
Node_all <-unique(c(Node_source, Node_target))

#assign IDs for Source (start from 0)
Source <-combined_D3obj$Links$Source
Target <- combined_D3obj$Links$Target

for(i in 1:length(Node_all)){
   Source[Source==Node_all[i]] <-i-1
   Target[Target==Node_all[i]] <-i-1
}
# 
combined_D3obj$Links$Source <- as.numeric(Source)
combined_D3obj$Links$Target <- as.numeric(Target)
combined_D3obj$Links$LinkColor <- combined$NodeGroup

#prepare node info
node_df <-data.frame(Node=Node_all)
node_df$id <-as.numeric(c(0, 1:(length(Node_all)-1)))

suppressMessages(library(dplyr))
Color <- combined %>% count(Node, color=NodeGroup) %>% select(2)
node_df$color <- Color$color

suppressMessages(library(networkD3))
p1<-sankeyNetwork(Links =combined_D3obj$Links, Nodes = node_df,
                 Value = "Value", NodeGroup ="color", LinkGroup = "LinkColor", 
                 NodeID="Node", Source="Source", Target="Target", fontSize = 22)
p1

#saveNetwork(p1, file = paste0(path,'Subpopulation_Net.html'))


## ----prediction_two_samples,warning = FALSE, message = FALSE, fig.width=5-----
#run prediction for 3 clusters
cluster_mixedpop1 <- colData(mixedpop1)[,1]
cluster_mixedpop2 <- colData(mixedpop2)[,1]
row_cluster <-length(unique(colData(mixedpop2)[,1]))

c_selectID <- 1
#top 200 gene markers distinguishing cluster 1
genes = DEgenes$id[1:200]
LSOLDA_dat1 <- bootstrap_prediction(nboots = 2, mixedpop1 = mixedpop1,
                        mixedpop2 = mixedpop2, genes=genes, c_selectID, 
                        listData =list(),
                        cluster_mixedpop1 = cluster_mixedpop1,
                        cluster_mixedpop2 = cluster_mixedpop2)


c_selectID <- 2
genes = DEgenes$id[1:200]
LSOLDA_dat2 <- bootstrap_prediction(nboots = 2,mixedpop1 = mixedpop1,
                        mixedpop2 = mixedpop2, genes=genes, c_selectID, 
                        listData =list(),
                        cluster_mixedpop1 = cluster_mixedpop1,
                        cluster_mixedpop2 = cluster_mixedpop2)

c_selectID <- 3
genes = DEgenes$id[1:200]
LSOLDA_dat3 <- bootstrap_prediction(nboots = 2,mixedpop1 = mixedpop1,
                        mixedpop2 = mixedpop2, genes=genes, c_selectID, 
                        listData =list(),
                        cluster_mixedpop1 = cluster_mixedpop1,
                        cluster_mixedpop2 = cluster_mixedpop2)

#prepare table input for sankey plot

LASSO_C1S1  <- reformat_LASSO(c_selectID=1, mp_selectID = 1,
                             LSOLDA_dat=LSOLDA_dat1, nPredSubpop = row_cluster, 
                             Nodes_group = "#7570b3")

LASSO_C2S1  <- reformat_LASSO(c_selectID=2, mp_selectID = 1,
                             LSOLDA_dat=LSOLDA_dat2, nPredSubpop = row_cluster, 
                             Nodes_group = "#1b9e77")

LASSO_C3S1  <- reformat_LASSO(c_selectID=3, mp_selectID = 1,
                             LSOLDA_dat=LSOLDA_dat3, nPredSubpop = row_cluster, 
                             Nodes_group = "#e7298a")


combined <- rbind(LASSO_C1S1,LASSO_C2S1,LASSO_C3S1)

nboots = 2
#links: source, target, value
#source: node, nodegroup
combined_D3obj <-list(Nodes=combined[,(nboots+3):(nboots+4)],
                     Links=combined[,c((nboots+2):(nboots+1),ncol(combined))])
combined <- combined[is.na(combined$Value) != TRUE,]


library(networkD3)

Node_source <- as.vector(sort(unique(combined_D3obj$Links$Source)))
Node_target <- as.vector(sort(unique(combined_D3obj$Links$Target)))
Node_all <-unique(c(Node_source, Node_target))

#assign IDs for Source (start from 0)
Source <-combined_D3obj$Links$Source
Target <- combined_D3obj$Links$Target

for(i in 1:length(Node_all)){
   Source[Source==Node_all[i]] <-i-1
   Target[Target==Node_all[i]] <-i-1
}

combined_D3obj$Links$Source <- as.numeric(Source)
combined_D3obj$Links$Target <- as.numeric(Target)
combined_D3obj$Links$LinkColor <- combined$NodeGroup

#prepare node info
node_df <-data.frame(Node=Node_all)
node_df$id <-as.numeric(c(0, 1:(length(Node_all)-1)))

suppressMessages(library(dplyr))
n <- length(unique(node_df$Node))
getPalette = colorRampPalette(RColorBrewer::brewer.pal(9, "Set1"))
Color = getPalette(n)
node_df$color <- Color
suppressMessages(library(networkD3))
p1<-sankeyNetwork(Links =combined_D3obj$Links, Nodes = node_df,
                 Value = "Value", NodeGroup ="color", LinkGroup = "LinkColor",
                 NodeID="Node", Source="Source", Target="Target", fontSize = 22)
p1
#saveNetwork(p1, file = paste0(path,'Subpopulation_Net.html'))

## ----session_info, include=TRUE, echo=TRUE, results='markup'------------------
devtools::session_info()