## ----setup, message=FALSE, warning=FALSE, include=FALSE----------------------- knitr::opts_chunk$set(collapse = TRUE, warning = TRUE, fig.wide = TRUE, cache = FALSE, crop = FALSE) suppressPackageStartupMessages(library("org.Hs.eg.db")) genesKegg <- base::as.list(org.Hs.eg.db::org.Hs.egPATH) genesReact <- base::as.list(reactome.db::reactomeEXTID2PATHID) # Remove genes and pathways which are not from human pathways genesReact <- lapply(genesReact, function(x) { grep("R-HSA-", x, value = TRUE) }) genesReact <- genesReact[lengths(genesReact) >= 1] library("BioCor") ## ----merging------------------------------------------------------------------ kegg <- mgeneSim(c("672", "675", "10"), genesKegg) react <- mgeneSim(c("672", "675", "10"), genesReact) ## We can sum it adding a weight to each origin weighted.sum(c(kegg["672", "675"], react["672", "675"]), w = c(0.3, 0.7)) ## Or if we want to perform for all the matrix ## A list of matrices to merge sim <- list("kegg" = kegg, "react" = react) similarities(sim, weighted.sum, w = c(0.3, 0.7)) similarities(sim, weighted.prod, w = c(0.3, 0.7)) similarities(sim, prod) similarities(sim, mean) ## ----weighted----------------------------------------------------------------- weighted.mean(c(1, NA), w = c(0.5, 0.5), na.rm = TRUE) weighted.mean(c(1, 0.5, NA), w = c(0.5, 0.25, 0.25), na.rm = TRUE) weighted.sum(c(1, NA), w = c(0.5, 0.5)) weighted.sum(c(1, 0.5, NA), w = c(0.5, 0.25, 0.25)) weighted.prod(c(1, NA), w = c(0.5, 0.5)) weighted.prod(c(1, 0.5, NA), w = c(0.5, 0.25, 0.25)) ## ----simulate, fig.cap="Volcano plot. The airway data", fig.wide = TRUE, code_folding = "hide"---- suppressPackageStartupMessages(library("airway")) data("airway") library("DESeq2") dds <- DESeqDataSet(airway, design = ~ cell + dex) dds$dex <- relevel(dds$dex, "untrt") dds <- DESeq(dds) res <- results(dds, alpha = 0.05) summary(res) plot(res$log2FoldChange, -log10(res$padj), pch = 16, xlab = "log2FC", ylab = "-log10(p.ajd)", main = "Untreated vs treated" ) logFC <- 2.5 abline(v = c(-logFC, logFC), h = -log10(0.05), col = "red") ## ----BioCor------------------------------------------------------------------- fc <- res[abs(res$log2FoldChange) >= logFC & !is.na(res$padj), ] fc <- fc[fc$padj < 0.05, ] # Convert Ids (used later) genes <- select(org.Hs.eg.db, keys = rownames(res), keytype = "ENSEMBL", column = c("ENTREZID", "SYMBOL") ) genesFC <- genes[genes$ENSEMBL %in% rownames(fc), ] genesFC <- genesFC[!is.na(genesFC$ENTREZID), ] genesSim <- genesFC$ENTREZID names(genesSim) <- genesFC$SYMBOL genesSim <- genesSim[!duplicated(genesSim)] # Calculate the functional similarity sim <- mgeneSim(genes = genesSim, info = genesReact, method = "BMA") ## ----pval1, fig.cap="Functional similarity of genes with logFC above 2,5. Similar genes cluster together.", fig.width=15, fig.height=20---- nas <- apply(sim, 1, function(x) { all(is.na(x)) }) sim <- sim[!nas, !nas] MDSs <- cmdscale(1 - sim) plot(MDSs, type = "n", main = "BMA similarity", xlab = "MDS1", ylab = "MDS2") up <- genes[genes$ENSEMBL %in% rownames(fc)[fc$log2FoldChange >= logFC], "SYMBOL"] text(MDSs, labels = rownames(MDSs), col = ifelse(rownames(MDSs) %in% up, "black", "red")) abline(h = 0, v = 0) legend("top", legend = c("Up-regulated", "Down-regulated"), fill = c("black", "red")) ## ----setting, fig.cap="Volcano plot of the subset of 400 genes. This subset will be used in the following sections", code_folding = "hide"---- set.seed(250) subRes <- res[!is.na(res$log2FoldChange), ] subs <- sample.int(nrow(subRes), size = 400) subRes <- subRes[subs, ] g <- genes[genes$ENSEMBL %in% rownames(subRes), "ENTREZID"] gS <- mgeneSim(g[g %in% names(genesReact)], genesReact, "BMA") deg <- rownames(subRes[subRes$padj < 0.05 & !is.na(subRes$padj), ]) keep <- rownames(gS) %in% genes[genes$ENSEMBL %in% deg, "ENTREZID"] plot(subRes$log2FoldChange, -log10(subRes$padj), pch = 16, xlab = "log2FC", ylab = "-log10(p.ajd)", main = "Untreated vs treated" ) abline(v = c(-logFC, logFC), h = -log10(0.05), col = "red") ## ----cluster2, fig.cap="Distribution of scores between differentially expressed genes and those who aren't. The line indicates the mean score of the similarity between differentially expressed genes and those which aren't differentially expressed.", fig.wide = TRUE---- library("boot") # The mean of genes differentially expressed (scoreDEG <- mean(gS[!keep, keep], na.rm = TRUE)) b <- boot(data = gS, R = 1000, statistic = function(x, i) { g <- !rownames(x) %in% rownames(x)[i] mean(x[g, i], na.rm = TRUE) }) (p.val <- (1 + sum(b$t > scoreDEG)) / 1001) hist(b$t, main = "Distribution of scores", xlab = "Similarity score") abline(v = scoreDEG, col = "red") ## ----pval2, fig.cap="Distribution of the similarity within differentially expressed genes. The line indicates the mean funtional similarity whitin them.", fig.wide = TRUE---- (scoreW <- combineScores(gS[keep, keep], "avg")) b <- boot(data = gS, R = 1000, statistic = function(x, i) { mean(x[i, i], na.rm = TRUE) }) (p.val <- (1 + sum(b$t > scoreW)) / 1001) # P-value hist(b$t, main = "Distribution of scores", xlab = "Similarity score") abline(v = scoreW, col = "red") ## ----logfc1, fig.cap="Similarity of genes along abs(logFC). Assessing the similarity of genes according to their absolute log2 fold change.", fig.wide = TRUE, code_folding = "hide"---- s <- seq(0, max(abs(subRes$log2FoldChange)) + 0.05, by = 0.05) l <- sapply(s, function(x) { deg <- rownames(subRes[abs(subRes$log2FoldChange) >= x, ]) keep <- rownames(gS) %in% genes[genes$ENSEMBL %in% deg, "ENTREZID"] BetweenAbove <- mean(gS[keep, keep], na.rm = TRUE) AboveBelow <- mean(gS[keep, !keep], na.rm = TRUE) BetweenBelow <- mean(gS[!keep, !keep], na.rm = TRUE) c( "BetweenAbove" = BetweenAbove, "AboveBelow" = AboveBelow, "BetweenBelow" = BetweenBelow ) }) L <- as.data.frame(cbind(logfc = s, t(l))) plot(L$logfc, L$BetweenAbove, type = "l", xlab = "abs(log2) fold change", ylab = "Similarity score", main = "Similarity scores along logFC", col = "darkred" ) lines(L$logfc, L$AboveBelow, col = "darkgreen") lines(L$logfc, L$BetweenBelow, col = "black") legend("topleft", legend = c( "Between genes above and below threshold", "Whitin genes above threshold", "Whitin genes below threshold" ), fill = c("darkgreen", "darkred", "black") ) ## ----logfc2, fig.cap = "Functional similarity between the up-regulated and down-regulated genes.", code_folding = "hide"---- l <- sapply(s, function(x) { # Names of genes up and down regulated degUp <- rownames(subRes[subRes$log2FoldChange >= x, ]) degDown <- rownames(subRes[subRes$log2FoldChange <= -x, ]) # Translate to ids in gS keepUp <- rownames(gS) %in% genes[genes$ENSEMBL %in% degUp, "ENTREZID"] keepDown <- rownames(gS) %in% genes[genes$ENSEMBL %in% degDown, "ENTREZID"] # Calculate the mean similarity between each subgrup between <- mean(gS[keepUp, keepDown], na.rm = TRUE) c("UpVsDown" = between) }) L <- as.data.frame(cbind("logfc" = s, "UpVsDown" = l)) plot(L$logfc, L$UpVsDown, type = "l", xlab = "abs(log2) fold change threshold", ylab = "Similarity score", main = "Similarity scores along logFC" ) legend("topright", legend = "Up vs down regulated genes", fill = "black" ) ## ----newPathway, fig.wide = TRUE, eval = FALSE-------------------------------- # # Adding a new pathway "deg" to those genes # genesReact2 <- genesReact # diffGenes <- genes[genes$ENSEMBL %in% deg, "ENTREZID"] # genesReact2[diffGenes] <- sapply(genesReact[diffGenes], function(x) { # c(x, "deg") # }) # plot(ecdf(mgeneSim(names(genesReact), genesReact))) # curve(ecdf(mgeneSim(names(genesReact2), genesReact2)), color = "red") ## ----newPathway2, fig.wide=TRUE, fig.cap="The effect of adding a new pathway to a functional similarity. In red the same network as in black but with the added pathway.", warning=FALSE, message=FALSE---- library("Hmisc") genesReact2 <- genesReact diffGenes <- genes[genes$ENSEMBL %in% deg, "ENTREZID"] # Create the new pathway called deg genesReact2[diffGenes] <- sapply(genesReact[diffGenes], function(x) { c(x, "deg") }) ids <- unique(genes[genes$ENSEMBL %in% rownames(subRes), "ENTREZID"]) Ecdf(c( mgeneSim(ids, genesReact, method = "BMA"), mgeneSim(ids, genesReact2, method = "BMA") ), group = c(rep("Reactome", length(ids)^2), rep("Modified", length(ids)^2)), col = c("black", "red"), xlab = "Functional similarities", main = "Empirical cumulative distribution" ) ## ----combineSource, fig.cap = "Comparison of functional similarity in different databases."---- genesKegg <- as.list(org.Hs.egPATH) gSK <- mgeneSim(rownames(gS), genesKegg) mix <- combineSources(genesKegg, genesReact) gSMix <- mgeneSim(rownames(gS), mix) Ecdf(c(gS, gSK, gSMix), group = c( rep("Reactome", length(gS)), rep("Kegg", length(gSK)), rep("Mix", length(gSMix)) ), col = c("black", "red", "blue"), xlab = "Functional similarities", main = "ecdf" ) ## ----combineSource2, fig.cap = "Comparison of functional similarity in different gene sets."---- gSK2 <- mgeneSim(rownames(gS), genesKegg, method = "BMA") gS2 <- mgeneSim(rownames(gS), genesReact, method = "BMA") gSMix2 <- mgeneSim(rownames(gS), mix, method = "BMA") Ecdf(c(gS2, gSK2, gSMix2), group = c( rep("Reactome", length(gS)), rep("Kegg", length(gSK)), rep("Mix", length(gSMix)) ), col = c("black", "red", "blue"), xlab = "Functional similarities (BMA)", main = "ecdf" ) ## ----miRNA1------------------------------------------------------------------- library("targetscan.Hs.eg.db") ## select human mirna humanMirnaIdx <- grep("hsa", mappedkeys(targetscan.Hs.egMIRNA)) ## select seed-based families for human mirna humanMirna <- mappedkeys(targetscan.Hs.egMIRNA)[humanMirnaIdx] ## select targets of families humanMirnaFamilies <- unlist(mget(humanMirna, targetscan.Hs.egMIRBASE2FAMILY)) humanMirnaTargets <- mget(humanMirnaFamilies, revmap(targetscan.Hs.egTARGETS)) names(humanMirnaTargets) <- humanMirna # Restrict to miRNA with more than one target and less than 150 miRNAs <- sample(humanMirnaTargets[lengths(humanMirnaTargets) > 1 & lengths(humanMirnaTargets) < 150], 10) lengths(miRNAs) ## ----miRNA2------------------------------------------------------------------- cluster1 <- mclusterSim(miRNAs, genesReact, method = "BMA") knitr::kable(round(cluster1, 2), caption = "The similarity between miRNA", format = "html") ## ----GOSemSim, eval=FALSE,fig.cap="Comparison of similarities. Functional similarities compared to biological process semantic similarity.",fig.wide=TRUE,include=TRUE---- # library("GOSemSim") # BP <- godata("org.Hs.eg.db", ont = "BP", computeIC = TRUE) # gsGO <- GOSemSim::mgeneSim(rownames(gS), semData = BP, measure = "Resnik", verbose = FALSE) # keep <- rownames(gS) %in% rownames(gsGO) # hist(as.dist(gS[keep, keep] - gsGO), # main = "Difference between functional similarity and biological process", # xlab = "Functional similarity - biological process similarity" # ) ## ----session, code_folding = "hide"------------------------------------------- sessionInfo()