## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----load_libs, warning = FALSE, message = FALSE, include = TRUE-------------- library(GAPGOM) library(profvis) library(GO.db) library(graph) library(ggplot2) library(reshape2) library(Biobase) ## ---- eval=FALSE-------------------------------------------------------------- # # Example with default dataset, take a look at the data documentation # # to fully grasp what's going on with making of the filter etc. (Biobase # # ExpressionSet) # # # keep everything that is a protein coding gene # filter_vector <- expset@featureData@data[(expset@featureData@data$GeneType=="protein_coding"),]$GeneID # # set gid and run. # gid <- "ENSG00000228630" # # p <- profvis::profvis({GAPGOM::expression_prediction(gid, # GAPGOM::expset, # "human", # "BP", # id_translation_df = GAPGOM::id_translation_df, # id_select_vector = filter_vector, # method = "combine", verbose = TRUE, # filter_pvals = TRUE # )}) # time <- max(p$x$message$prof$time)*10 # mem <- max(p$x$message$prof$memalloc) ## ----------------------------------------------------------------------------- # laptop # time 310 # mem 124.91 # --- # server # time 560 # mem 72.47 ## ---- eval=FALSE-------------------------------------------------------------- # # prepare the godata for mouse and some other calculations later needed in benchmarking # organism <- "human" # ontology <- "BP" # go_data <- GAPGOM::set_go_data(organism, ontology) ## ----topotitj, eval=FALSE----------------------------------------------------- # # grab 15 random GOs (for term algorithm) # ## sample(unique(go_data@geneAnno$GO), 15) # random_gos <- c("GO:0030177", "GO:0001771", "GO:0045715", "GO:0044330", "GO:0098780", # "GO:1901006", "GO:0061143", "GO:0060025", "GO:0015695", "GO:0090074", # "GO:0035445", "GO:0008595", "GO:1903634", "GO:1903826", "GO:0048389" # ) # # print them for reproducability # ## dput(random_gos) # # now compare all unique random GO pairs. (105 uniques). # unique_pairs <- GAPGOM:::.unique_combos(random_gos, random_gos) # # times <- c() # mem_usages <- c() # for (i in seq_len(nrow(unique_pairs))) { # prof_toptitj <- profvis({ # pair <- unique_pairs[i] # go1 <- pair[[1]] # go2 <- pair[[2]] # GAPGOM::topo_ic_sim_term(organism, ontology, go1, go2, go_data = go_data) # }) # time <- max(prof_toptitj$x$message$prof$time)*10 # mem <- max(prof_toptitj$x$message$prof$memalloc) # mem_usages <- c(mem_usages, mem) # times <- c(times, time) # gc() # } # times_term <- times # mems_term <- mem_usages ## ---- fig.width=3, fig.height=3----------------------------------------------- times_term_df <- data.frame(GAPGOM:::benchmarks$server_times_term, GAPGOM:::benchmarks$laptop_times_term, seq_len(105) ) colnames(times_term_df) <- c("server", "laptop", "n") times_term_df_melted <- melt(times_term_df, id="n") colnames(times_term_df_melted) <- c("n", "machine", "milliseconds") ggplot(times_term_df_melted, aes(x=machine, y=milliseconds, colour=machine)) + geom_boxplot(notch = FALSE) + scale_y_continuous(breaks=pretty(times_term_df_melted$milliseconds, n = 5)) + labs(title = paste(strwrap("Speed of term algorithm", width = 20), collapse = "\n")) + theme(plot.title = element_text(hjust = 0.5)) ## ---- fig.width=3, fig.height=3----------------------------------------------- mems_term_df <- data.frame(GAPGOM:::benchmarks$server_mems_term, GAPGOM:::benchmarks$laptop_mems_term, seq_len(105) ) colnames(mems_term_df) <- c("server", "laptop", "n") mems_term_df_melted <- melt(times_term_df, id="n") colnames(mems_term_df_melted) <- c("n", "machine", "RAM") ggplot(mems_term_df_melted, aes(x=machine, y=RAM, colour=machine)) + geom_boxplot(notch = FALSE) + scale_y_continuous(breaks=pretty(mems_term_df_melted$RAM, n = 5)) + labs(title = paste(strwrap("RAM usage for gene algorithm", width = 20), collapse = "\n")) + theme(plot.title = element_text(hjust = 0.5)) ## ---- eval=FALSE-------------------------------------------------------------- # ## dput(sample(unique(go_data@geneAnno$ENTREZID), 5)) # random_genes <- c("3848", "2824", "65108", "3988", "10800") # # unique_pairs <- GAPGOM:::.unique_combos(random_genes, random_genes) # # times <- c() # mem_usages <- c() # for (i in seq_len(nrow(unique_pairs))) { # prof_topg1g2 <- profvis({ # pair <- unique_pairs[i] # gene1 <- pair[[1]] # gene2 <- pair[[2]] # GAPGOM::topo_ic_sim_genes(organism, ontology, gene1, gene2, go_data=go_data) # }) # time <- max(prof_topg1g2$x$message$prof$time)*10 # mem <- max(prof_topg1g2$x$message$prof$memalloc) # mem_usages <- c(mem_usages, mem) # times <- c(times, time) # gc() # } # times # mem_usages # times_gen <- times # mems_gen <- mem_usages ## ---- fig.width=3, fig.height=3----------------------------------------------- times_gene_df <- data.frame(GAPGOM:::benchmarks$server_times_gen, GAPGOM:::benchmarks$laptop_times_gen, seq_len(10) ) colnames(times_gene_df) <- c("server", "laptop", "n") times_gene_df_melted <- melt(times_gene_df, id="n") times_gene_df_melted$value <- times_gene_df_melted$value/1000 colnames(times_gene_df_melted) <- c("n", "machine", "seconds") ggplot(times_gene_df_melted, aes(x=machine, y=seconds, colour=machine)) + geom_boxplot(notch = FALSE) + labs(title = paste(strwrap("Speed of gene algorithm", width = 20), collapse = "\n")) + theme(plot.title = element_text(hjust = 0.5)) ## ---- fig.width=3, fig.height=3----------------------------------------------- mems_gene_df <- data.frame(GAPGOM:::benchmarks$server_mems_gen, GAPGOM:::benchmarks$laptop_mems_gen, seq_len(10) ) colnames(mems_gene_df) <- c("server", "laptop", "n") mems_gene_df_melted <- melt(mems_gene_df, id="n") colnames(mems_gene_df_melted) <- c("n", "machine", "RAM") ggplot(mems_gene_df_melted, aes(x=machine, y=RAM, colour=machine)) + geom_boxplot(notch = FALSE) + scale_y_continuous(breaks=pretty(mems_gene_df_melted$RAM, n = 5)) + labs(title = paste(strwrap("RAM usage for gene algorithm", width = 20), collapse = "\n")) + theme(plot.title = element_text(hjust = 0.5)) ## ---- eval=FALSE-------------------------------------------------------------- # list1=c("126133","221","218","216","8854","220","219","160428","224","222","8659","501","64577","223","217","4329","10840","7915","5832") # times <- c() # mem_usages <- c() # for (i in seq(length(list1)-1)) { # sampled_list <- list1[1:(i+1)] # print(sampled_list) # p <- profvis({ # GAPGOM::topo_ic_sim_genes(organism, ontology, sampled_list, sampled_list, drop=NULL, go_data=go_data) # }) # time <- max(p$x$message$prof$time)*10 # mem <- max(p$x$message$prof$memalloc) # mem_usages <- c(mem_usages, mem) # times <- c(times, time) # gc() # } # times # mem_usages # times_genset <- times # mems_genset <- mem_usages ## ---- fig.width=6, fig.height=3----------------------------------------------- final_times <- data.frame(GAPGOM:::benchmarks$server_times_genset, GAPGOM:::benchmarks$laptop_times_genset, seq_along(GAPGOM:::benchmarks$laptop_times_genset) ) colnames(final_times) <- c("server", "laptop", "n") final_times_melted <- melt(final_times, id="n") final_times_melted$value <- final_times_melted$value/1000/60 colnames(final_times_melted) <- c("Genes in geneset", "Machine", "Minutes") p <- ggplot(data=final_times_melted, aes(x=`Genes in geneset`, y=Minutes, colour=Machine)) + geom_point() + labs(title = paste(strwrap("Speed of TopoICSim geneset algorithm", width = 20), collapse = "\n")) + theme(plot.title = element_text(hjust = 0.5)) p p + stat_smooth(method = "lm", formula = y ~ x + I(x^2), size = 1) ## ---- fig.width=3, fig.height=3----------------------------------------------- mems_geneset_df <- data.frame(GAPGOM:::benchmarks$server_mems_genset, GAPGOM:::benchmarks$laptop_mems_genset, seq_len(18) ) colnames(mems_geneset_df) <- c("server", "laptop", "n") mems_geneset_df_melted <- melt(mems_geneset_df, id="n") colnames(mems_geneset_df_melted) <- c("Genes in geneset", "machine", "RAM") ggplot(mems_geneset_df_melted, aes(x=machine, y=RAM, colour=machine)) + geom_boxplot(notch = FALSE) + scale_y_continuous(breaks=pretty(mems_geneset_df_melted$RAM, n = 5)) + labs(title = paste(strwrap("RAM usage for geneset algorithm", width = 20), collapse = "\n")) + theme(plot.title = element_text(hjust = 0.5)) ## ---- fig.width=6, fig.height=3----------------------------------------------- p <- ggplot(mems_geneset_df_melted, aes(x=`Genes in geneset`, y=RAM, colour=machine)) + geom_point() + scale_y_continuous(breaks=pretty(mems_geneset_df_melted$RAM, n = 5)) + labs(title = paste(strwrap("RAM usage for geneset algorithm", width = 20), collapse = "\n")) + theme(plot.title = element_text(hjust = 0.5)) p p + stat_smooth(method = "lm", formula = y ~ x + I(x^2), size = 1) ## ---- eval=FALSE-------------------------------------------------------------- # ## set stringsasfactors to false to ensure data is loaded properly. # options(stringsAsFactors = FALSE) # # ## load data # # # expdata <- read.table("~/Downloads/GSE63733_m.txt") # # http://software.broadinstitute.org/gsea/msigdb/cards/HALLMARK_GLYCOLYSIS.html # # GLYCOLOSIS HALLMARK # geneset <- read.table("~/Downloads/geneset_glyc.txt", sep="\n") # # colnames(expdata)[1:2] <- c("ENSEMBL", "SYMBOL") # Set important colnames # # ## make an expressionset # # expression_matrix <- as.matrix(expdata[,3:ncol(expdata)]) # rownames(expression_matrix) <- expdata[[1]] # featuredat <- as.data.frame(expdata[,1:2]) # rownames(featuredat) <- expdata[[1]] # expset <- ExpressionSet(expression_matrix, # featureData = new("AnnotatedDataFrame", # data=featuredat)) # # ## make a selection on the geneset # geneset <- as.vector(geneset[3:nrow(geneset),]) # geneset_ensembl <- expdata[expdata[[2]] %in% geneset,][[1]] # # select on the 10 genes with the most variance # geneset_ensembl_topvar <- names(rev(sort(apply( # expression_matrix[geneset_ensembl,], 1, var)))[1:10]) # # convert back to symbol # geneset_selection <- expdata[expdata$ENSEMBL %in% geneset_ensembl_topvar,][[2]] # # ## predict annotation of the geneset selection (per ontology). # # ontologies <- c("MF", "BP", "CC") # bench_results <- list() # for (ontology in ontologies) { # # print("---") # # print(ontology) # predicted_annotations <- list() # for (gene in geneset_ensembl_topvar) { # symbol <- expdata[expdata$ENSEMBL==gene,]$SYMBOL # # result <- GAPGOM::expression_prediction(gene, # expset, # "human", # ontology, # idtype = "ENSEMBL") # go_annotation_prediction <- unique(as.vector(result$GOID)) # predicted_annotations[[symbol]] <- go_annotation_prediction # } # print("Prediction finished! Calculating sims now") # # ## compare custom genes to original genes # tmp_go_data <- set_go_data("human", ontology, computeIC = TRUE, # keytype = "SYMBOL") # bench_results[[ontology]] <- mapply( # function(gos, name) { # custom_gene <- list() # custom_gene[[paste0(name, "_pred")]] <- gos # # name_orig <- unlist(strsplit(name, "_"), # FALSE, FALSE)[1] # # # print(name_orig) # # return(GAPGOM::topo_ic_sim_genes("human", ontology, c(), name_orig, # custom_genes1 = custom_gene, # idtype = "SYMBOL", # progress_bar = FALSE, go_data = tmp_go_data)$GeneSim) # }, gos = predicted_annotations, name = names(predicted_annotations) # ) # } # # save(bench_results, file = "~/tmp_bench.RData") ## ----------------------------------------------------------------------------- bench_results <- GAPGOM:::benchmarks$bench_results ontologies <- c("MF", "BP", "CC") for (ontology in ontologies) { dat <- data.frame(Gene = names(bench_results[[ontology]]), sim = bench_results[[ontology]]) brplot <- ggplot(dat, aes(x=Gene, weight=sim)) + geom_bar() + coord_flip() + labs(title = paste(strwrap(paste0( "Similarities of custom genes versus originals in ", ontology, " Ontology"), width = 30), collapse = "\n")) + theme(plot.title = element_text(hjust = 0.5)) + ylab("Similarity (TopoICSim)") print("---") print(ontology) print("mean:") print(mean(bench_results[[ontology]])) print(brplot) } ## ----------------------------------------------------------------------------- sessionInfo()