## ----setup, eval = TRUE, echo = FALSE----------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----installation, eval = FALSE----------------------------------------------- # # To install scGPS from github (Depending on the configuration of the local # # computer or HPC, possible custom C++ compilation may be required - see # # installation trouble-shootings below) # devtools::install_github("IMB-Computational-Genomics-Lab/scGPS") # # # for C++ compilation trouble-shooting, manual download and installation can be # # done from github # # git clone https://github.com/IMB-Computational-Genomics-Lab/scGPS # # # then check in scGPS/src if any of the precompiled (e.g. those with *.so and # # *.o) files exist and delete them before recompiling # # # then with the scGPS as the R working directory, manually install and load # # using devtools functionality # # Install the package # devtools::install() # #load the package to the workspace # library(scGPS) # ## ----scGPS_object, warning = FALSE, message = FALSE--------------------------- # load mixed population 1 (loaded from day_2_cardio_cell_sample dataset, # named it as day2) library(scGPS) day2 <- day_2_cardio_cell_sample mixedpop1 <- new_scGPS_object(ExpressionMatrix = day2$dat2_counts, GeneMetadata = day2$dat2geneInfo, CellMetadata = day2$dat2_clusters) # load mixed population 2 (loaded from day_5_cardio_cell_sample dataset, # named it as day5) day5 <- day_5_cardio_cell_sample mixedpop2 <- new_scGPS_object(ExpressionMatrix = day5$dat5_counts, GeneMetadata = day5$dat5geneInfo, CellMetadata = day5$dat5_clusters) ## ----prediction, warning = FALSE, message = FALSE----------------------------- # select a subpopulation c_selectID <- 1 # load gene list (this can be any lists of user selected genes) genes <- training_gene_sample genes <- genes$Merged_unique # load cluster information cluster_mixedpop1 <- colData(mixedpop1)[,1] cluster_mixedpop2 <- colData(mixedpop2)[,1] #run training (running nboots = 3 here, but recommend to use nboots = 50-100) LSOLDA_dat <- bootstrap_prediction(nboots = 3, mixedpop1 = mixedpop1, mixedpop2 = mixedpop2, genes = genes, c_selectID = c_selectID, listData = list(), cluster_mixedpop1 = cluster_mixedpop1, cluster_mixedpop2 = cluster_mixedpop2, trainset_ratio = 0.7) names(LSOLDA_dat) ## ----summarise_results, warning = FALSE, message = FALSE---------------------- # summary results LDA sum_pred_lda <- summary_prediction_lda(LSOLDA_dat = LSOLDA_dat, nPredSubpop = 4) # summary results Lasso to show the percent of cells # classified as cells belonging sum_pred_lasso <- summary_prediction_lasso(LSOLDA_dat = LSOLDA_dat, nPredSubpop = 4) # plot summary results plot_sum <-function(sum_dat){ sum_dat_tf <- t(sum_dat) sum_dat_tf <- na.omit(sum_dat_tf) sum_dat_tf <- apply(sum_dat[, -ncol(sum_dat)],1, function(x){as.numeric(as.vector(x))}) sum_dat$names <- gsub("ElasticNet for subpop","sp", sum_dat$names ) sum_dat$names <- gsub("in target mixedpop","in p", sum_dat$names) sum_dat$names <- gsub("LDA for subpop","sp", sum_dat$names ) sum_dat$names <- gsub("in target mixedpop","in p", sum_dat$names) colnames(sum_dat_tf) <- sum_dat$names boxplot(sum_dat_tf, las=2) } plot_sum(sum_pred_lasso) plot_sum(sum_pred_lda) # summary accuracy to check the model accuracy in the leave-out test set summary_accuracy(object = LSOLDA_dat) # summary maximum deviance explained by the model summary_deviance(object = LSOLDA_dat) ## ----CORE, warning = FALSE, message = FALSE----------------------------------- # find clustering information in an expresion data using CORE day5 <- day_5_cardio_cell_sample cellnames <- colnames(day5$dat5_counts) cluster <-day5$dat5_clusters cellnames <-data.frame("Cluster"=cluster, "cellBarcodes" = cellnames) mixedpop2 <-new_scGPS_object(ExpressionMatrix = day5$dat5_counts, GeneMetadata = day5$dat5geneInfo, CellMetadata = cellnames) CORE_cluster <- CORE_clustering(mixedpop2, remove_outlier = c(0), PCA=FALSE) # to update the clustering information, users can ... key_height <- CORE_cluster$optimalClust$KeyStats$Height optimal_res <- CORE_cluster$optimalClust$OptimalRes optimal_index = which(key_height == optimal_res) clustering_after_outlier_removal <- unname(unlist( CORE_cluster$Cluster[[optimal_index]])) corresponding_cells_after_outlier_removal <- CORE_cluster$cellsForClustering original_cells_before_removal <- colData(mixedpop2)[,2] corresponding_index <- match(corresponding_cells_after_outlier_removal, original_cells_before_removal ) # check the matching identical(as.character(original_cells_before_removal[corresponding_index]), corresponding_cells_after_outlier_removal) # create new object with the new clustering after removing outliers mixedpop2_post_clustering <- mixedpop2[,corresponding_index] colData(mixedpop2_post_clustering)[,1] <- clustering_after_outlier_removal ## ----SCORE with bagging, warning = FALSE, message = FALSE--------------------- # find clustering information in an expresion data using SCORE day5 <- day_5_cardio_cell_sample cellnames <- colnames(day5$dat5_counts) cluster <-day5$dat5_clusters cellnames <-data.frame("Cluster"=cluster, "cellBarcodes" = cellnames) mixedpop2 <-new_scGPS_object(ExpressionMatrix = day5$dat5_counts, GeneMetadata = day5$dat5geneInfo, CellMetadata = cellnames ) SCORE_test <- CORE_bagging(mixedpop2, remove_outlier = c(0), PCA=FALSE, bagging_run = 20, subsample_proportion = .8) ## ----visualisation, dev='CairoPDF'-------------------------------------------- dev.off() ##3.2.1 plot CORE clustering p1 <- plot_CORE(CORE_cluster$tree, CORE_cluster$Cluster, color_branch = c("#208eb7", "#6ce9d3", "#1c5e39", "#8fca40", "#154975", "#b1c8eb")) p1 #extract optimal index identified by CORE key_height <- CORE_cluster$optimalClust$KeyStats$Height optimal_res <- CORE_cluster$optimalClust$OptimalRes optimal_index = which(key_height == optimal_res) #plot one optimal clustering bar plot_optimal_CORE(original_tree= CORE_cluster$tree, optimal_cluster = unlist(CORE_cluster$Cluster[optimal_index]), shift = -2000) ##3.2.2 plot SCORE clustering #plot all clustering bars plot_CORE(SCORE_test$tree, list_clusters = SCORE_test$Cluster) #plot one stable optimal clustering bar plot_optimal_CORE(original_tree= SCORE_test$tree, optimal_cluster = unlist(SCORE_test$Cluster[ SCORE_test$optimal_index]), shift = -100) ## ----compare_clustering, fig.width=5, fig.height=5---------------------------- t <- tSNE(expression.mat=assay(mixedpop2)) p2 <-plot_reduced(t, color_fac = factor(colData(mixedpop2)[,1]), palletes =1:length(unique(colData(mixedpop2)[,1]))) p2 ## ----find_markers, warning = FALSE, message = FALSE, fig.width=7-------------- #load gene list (this can be any lists of user-selected genes) genes <-training_gene_sample genes <-genes$Merged_unique #the gene list can also be objectively identified by differential expression #analysis cluster information is requied for find_markers. Here, we use #CORE results. #colData(mixedpop2)[,1] <- unlist(SCORE_test$Cluster[SCORE_test$optimal_index]) suppressMessages(library(locfit)) DEgenes <- find_markers(expression_matrix=assay(mixedpop2), cluster = colData(mixedpop2)[,1], selected_cluster=unique(colData(mixedpop2)[,1])) #the output contains dataframes for each cluster. #the data frame contains all genes, sorted by p-values names(DEgenes) #you can annotate the identified clusters DEgeneList_1vsOthers <- DEgenes$DE_Subpop1vsRemaining$id #users need to check the format of the gene input to make sure they are #consistent to the gene names in the expression matrix #the following command saves the file "PathwayEnrichment.xlsx" to the #working dir #use 500 top DE genes suppressMessages(library(DOSE)) suppressMessages(library(ReactomePA)) suppressMessages(library(clusterProfiler)) genes500 <- as.factor(DEgeneList_1vsOthers[seq_len(500)]) enrichment_test <- annotate_clusters(genes, pvalueCutoff=0.05, gene_symbol=TRUE) #the enrichment outputs can be displayed by running clusterProfiler::dotplot(enrichment_test, showCategory=10, font.size = 6) ## ----scGPS_prediction, warning = FALSE, message = FALSE----------------------- #select a subpopulation, and input gene list c_selectID <- 1 #note make sure the format for genes input here is the same to the format #for genes in the mixedpop1 and mixedpop2 genes = DEgenes$id[1:500] #run the test bootstrap with nboots = 2 runs cluster_mixedpop1 <- colData(mixedpop1)[,1] cluster_mixedpop2 <- colData(mixedpop2)[,1] LSOLDA_dat <- bootstrap_prediction(nboots = 2, mixedpop1 = mixedpop1, mixedpop2 = mixedpop2, genes = genes, c_selectID = c_selectID, listData = list(), cluster_mixedpop1 = cluster_mixedpop1, cluster_mixedpop2 = cluster_mixedpop2) ## ----summarise_prediction----------------------------------------------------- #get the number of rows for the summary matrix row_cluster <-length(unique(colData(mixedpop2)[,1])) #summary results LDA to to show the percent of cells classified as cells #belonging by LDA classifier summary_prediction_lda(LSOLDA_dat=LSOLDA_dat, nPredSubpop = row_cluster ) #summary results Lasso to show the percent of cells classified as cells #belonging by Lasso classifier summary_prediction_lasso(LSOLDA_dat=LSOLDA_dat, nPredSubpop = row_cluster) # summary maximum deviance explained by the model during the model training summary_deviance(object = LSOLDA_dat) # summary accuracy to check the model accuracy in the leave-out test set summary_accuracy(object = LSOLDA_dat) ## ----prediction_one_sample, warning = FALSE, message = FALSE, fig.width=5----- #run prediction for 3 clusters cluster_mixedpop1 <- colData(mixedpop1)[,1] cluster_mixedpop2 <- colData(mixedpop2)[,1] #cluster_mixedpop2 <- as.numeric(as.vector(colData(mixedpop2)[,1])) c_selectID <- 1 #top 200 gene markers distinguishing cluster 1 genes = DEgenes$id[1:200] LSOLDA_dat1 <- bootstrap_prediction(nboots = 2, mixedpop1 = mixedpop2, mixedpop2 = mixedpop2, genes=genes, c_selectID, listData =list(), cluster_mixedpop1 = cluster_mixedpop2, cluster_mixedpop2 = cluster_mixedpop2) c_selectID <- 2 genes = DEgenes$id[1:200] LSOLDA_dat2 <- bootstrap_prediction(nboots = 2,mixedpop1 = mixedpop2, mixedpop2 = mixedpop2, genes=genes, c_selectID, listData =list(), cluster_mixedpop1 = cluster_mixedpop2, cluster_mixedpop2 = cluster_mixedpop2) c_selectID <- 3 genes = DEgenes$id[1:200] LSOLDA_dat3 <- bootstrap_prediction(nboots = 2,mixedpop1 = mixedpop2, mixedpop2 = mixedpop2, genes=genes, c_selectID, listData =list(), cluster_mixedpop1 = cluster_mixedpop2, cluster_mixedpop2 = cluster_mixedpop2) c_selectID <- 4 genes = DEgenes$id[1:200] LSOLDA_dat4 <- bootstrap_prediction(nboots = 2,mixedpop1 = mixedpop2, mixedpop2 = mixedpop2, genes=genes, c_selectID, listData =list(), cluster_mixedpop1 = cluster_mixedpop2, cluster_mixedpop2 = cluster_mixedpop2) #prepare table input for sankey plot LASSO_C1S2 <- reformat_LASSO(c_selectID=1, mp_selectID = 2, LSOLDA_dat=LSOLDA_dat1, nPredSubpop = length(unique(colData(mixedpop2) [,1])), Nodes_group ="#7570b3") LASSO_C2S2 <- reformat_LASSO(c_selectID=2, mp_selectID =2, LSOLDA_dat=LSOLDA_dat2, nPredSubpop = length(unique(colData(mixedpop2) [,1])), Nodes_group ="#1b9e77") LASSO_C3S2 <- reformat_LASSO(c_selectID=3, mp_selectID =2, LSOLDA_dat=LSOLDA_dat3, nPredSubpop = length(unique(colData(mixedpop2) [,1])), Nodes_group ="#e7298a") LASSO_C4S2 <- reformat_LASSO(c_selectID=4, mp_selectID =2, LSOLDA_dat=LSOLDA_dat4, nPredSubpop = length(unique(colData(mixedpop2) [,1])), Nodes_group ="#00FFFF") combined <- rbind(LASSO_C1S2,LASSO_C2S2,LASSO_C3S2, LASSO_C4S2 ) combined <- combined[is.na(combined$Value) != TRUE,] nboots = 2 #links: source, target, value #source: node, nodegroup combined_D3obj <-list(Nodes=combined[,(nboots+3):(nboots+4)], Links=combined[,c((nboots+2):(nboots+1),ncol(combined))]) library(networkD3) Node_source <- as.vector(sort(unique(combined_D3obj$Links$Source))) Node_target <- as.vector(sort(unique(combined_D3obj$Links$Target))) Node_all <-unique(c(Node_source, Node_target)) #assign IDs for Source (start from 0) Source <-combined_D3obj$Links$Source Target <- combined_D3obj$Links$Target for(i in 1:length(Node_all)){ Source[Source==Node_all[i]] <-i-1 Target[Target==Node_all[i]] <-i-1 } # combined_D3obj$Links$Source <- as.numeric(Source) combined_D3obj$Links$Target <- as.numeric(Target) combined_D3obj$Links$LinkColor <- combined$NodeGroup #prepare node info node_df <-data.frame(Node=Node_all) node_df$id <-as.numeric(c(0, 1:(length(Node_all)-1))) suppressMessages(library(dplyr)) Color <- combined %>% count(Node, color=NodeGroup) %>% select(2) node_df$color <- Color$color suppressMessages(library(networkD3)) p1<-sankeyNetwork(Links =combined_D3obj$Links, Nodes = node_df, Value = "Value", NodeGroup ="color", LinkGroup = "LinkColor", NodeID="Node", Source="Source", Target="Target", fontSize = 22) p1 #saveNetwork(p1, file = paste0(path,'Subpopulation_Net.html')) ## ----prediction_two_samples,warning = FALSE, message = FALSE, fig.width=5----- #run prediction for 3 clusters cluster_mixedpop1 <- colData(mixedpop1)[,1] cluster_mixedpop2 <- colData(mixedpop2)[,1] row_cluster <-length(unique(colData(mixedpop2)[,1])) c_selectID <- 1 #top 200 gene markers distinguishing cluster 1 genes = DEgenes$id[1:200] LSOLDA_dat1 <- bootstrap_prediction(nboots = 2, mixedpop1 = mixedpop1, mixedpop2 = mixedpop2, genes=genes, c_selectID, listData =list(), cluster_mixedpop1 = cluster_mixedpop1, cluster_mixedpop2 = cluster_mixedpop2) c_selectID <- 2 genes = DEgenes$id[1:200] LSOLDA_dat2 <- bootstrap_prediction(nboots = 2,mixedpop1 = mixedpop1, mixedpop2 = mixedpop2, genes=genes, c_selectID, listData =list(), cluster_mixedpop1 = cluster_mixedpop1, cluster_mixedpop2 = cluster_mixedpop2) c_selectID <- 3 genes = DEgenes$id[1:200] LSOLDA_dat3 <- bootstrap_prediction(nboots = 2,mixedpop1 = mixedpop1, mixedpop2 = mixedpop2, genes=genes, c_selectID, listData =list(), cluster_mixedpop1 = cluster_mixedpop1, cluster_mixedpop2 = cluster_mixedpop2) #prepare table input for sankey plot LASSO_C1S1 <- reformat_LASSO(c_selectID=1, mp_selectID = 1, LSOLDA_dat=LSOLDA_dat1, nPredSubpop = row_cluster, Nodes_group = "#7570b3") LASSO_C2S1 <- reformat_LASSO(c_selectID=2, mp_selectID = 1, LSOLDA_dat=LSOLDA_dat2, nPredSubpop = row_cluster, Nodes_group = "#1b9e77") LASSO_C3S1 <- reformat_LASSO(c_selectID=3, mp_selectID = 1, LSOLDA_dat=LSOLDA_dat3, nPredSubpop = row_cluster, Nodes_group = "#e7298a") combined <- rbind(LASSO_C1S1,LASSO_C2S1,LASSO_C3S1) nboots = 2 #links: source, target, value #source: node, nodegroup combined_D3obj <-list(Nodes=combined[,(nboots+3):(nboots+4)], Links=combined[,c((nboots+2):(nboots+1),ncol(combined))]) combined <- combined[is.na(combined$Value) != TRUE,] library(networkD3) Node_source <- as.vector(sort(unique(combined_D3obj$Links$Source))) Node_target <- as.vector(sort(unique(combined_D3obj$Links$Target))) Node_all <-unique(c(Node_source, Node_target)) #assign IDs for Source (start from 0) Source <-combined_D3obj$Links$Source Target <- combined_D3obj$Links$Target for(i in 1:length(Node_all)){ Source[Source==Node_all[i]] <-i-1 Target[Target==Node_all[i]] <-i-1 } combined_D3obj$Links$Source <- as.numeric(Source) combined_D3obj$Links$Target <- as.numeric(Target) combined_D3obj$Links$LinkColor <- combined$NodeGroup #prepare node info node_df <-data.frame(Node=Node_all) node_df$id <-as.numeric(c(0, 1:(length(Node_all)-1))) suppressMessages(library(dplyr)) n <- length(unique(node_df$Node)) getPalette = colorRampPalette(RColorBrewer::brewer.pal(9, "Set1")) Color = getPalette(n) node_df$color <- Color suppressMessages(library(networkD3)) p1<-sankeyNetwork(Links =combined_D3obj$Links, Nodes = node_df, Value = "Value", NodeGroup ="color", LinkGroup = "LinkColor", NodeID="Node", Source="Source", Target="Target", fontSize = 22) p1 #saveNetwork(p1, file = paste0(path,'Subpopulation_Net.html')) ## ----session_info, include=TRUE, echo=TRUE, results='markup'------------------ devtools::session_info()