## ----setup, include=FALSE----------------------------------------------------- options(fig_caption=TRUE) library(knitr) opts_chunk$set(out.extra='style="display:block; margin: auto"', fig.align="center") ## ----lib, warning=FALSE, message=FALSE, results="hide", include=FALSE--------- library(testthat) library(BioQC) library(hgu133plus2.db) ## to simulate an microarray expression dataset library(lattice) library(latticeExtra) library(gridExtra) library(gplots) library(reshape2) library(plyr) library(ggplot2) library(rbenchmark) pdf.options(family="ArialMT", useDingbats=FALSE) set.seed(1887) ## list human genes humanGenes <- unique(na.omit(unlist(as.list(hgu133plus2SYMBOL)))) ## read tissue-specific gene signatures gmtFile <- system.file("extdata/exp.tissuemark.affy.roche.symbols.gmt", package="BioQC") gmt <- readGmt(gmtFile) ## ----helper, include=FALSE---------------------------------------------------- ## Summarizes data. ## Gives count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%). ## data: a data frame. ## measurevar: the name of a column that contains the variable to be summariezed ## groupvars: a vector containing names of columns that contain grouping variables ## na.rm: a boolean that indicates whether to ignore NA's ## conf.interval: the percent range of the confidence interval (default is 95%) ## ## Source: http://www.cookbook-r.com/Manipulating_data/Summarizing_data/ summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE, conf.interval=.95, .drop=TRUE) { # New version of length which can handle NA's: if na.rm==T, don't count them length2 <- function (x, na.rm=FALSE) { if (na.rm) sum(!is.na(x)) else length(x) } # This does the summary. For each group's data frame, return a vector with # N, mean, and sd datac <- ddply(data, groupvars, .drop=.drop, .fun = function(xx, col) { c(N = length2(xx[[col]], na.rm=na.rm), mean = mean (xx[[col]], na.rm=na.rm), sd = sd (xx[[col]], na.rm=na.rm) ) }, measurevar ) # Rename the "mean" column datac <- rename(datac, c("mean" = measurevar)) datac$se <- datac$sd / sqrt(datac$N) # Calculate standard error of the mean # Confidence interval multiplier for standard error # Calculate t-statistic for confidence interval: # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1 ciMult <- qt(conf.interval/2 + .5, datac$N-1) datac$ci <- datac$se * ciMult return(datac) } ## ----sensitivity_benchmark_fig, echo=FALSE, warning=FALSE, fig.width=8, fig.height=3.5, dev='png', fig.cap="**Figure 1:** Sensitivity benchmark. Expression levels of genes in the ovary signature are dedicately sampled randomly from normal distributions with different mean values. The lines show the enrichment score for the Wilcoxon-Mann-Whitney test, the t-test and the Kolmogorov-Smirnov test respectively. In the right panel, outliers were added by adding a random value to 1% of the simulated genes. "---- randomMatrixButOneSignature <- function(rows=humanGenes, signatureGenes, amplitudes=seq(0, 3, by=0.5)) { nrow <- length(rows) ncol <- length(amplitudes) mat <- matrix(rnorm(nrow*ncol), nrow=nrow, byrow=FALSE) rownames(mat) <- rows sigInd <- na.omit(match(signatureGenes, humanGenes)) colClass <- factor(amplitudes) for(colInd in unique(colClass)) { isCurrCol <- colInd==colClass replaceMatrix <- matrix(rnorm(length(sigInd)*sum(isCurrCol), mean=amplitudes[isCurrCol][1]), nrow=length(sigInd), byrow=FALSE) mat[sigInd, isCurrCol] <- replaceMatrix } return(mat) } addNoise = function(matrix, fractionAffected=.1, stdv=1) { noise = matrix(rnorm(nrow(matrix)*ncol(matrix), sd=stdv), nrow=nrow(matrix), byrow=FALSE) addNoise = matrix(runif(nrow(matrix)*ncol(matrix)), nrow=nrow(matrix), byrow=FALSE) < fractionAffected matrix = matrix + addNoise*noise return(matrix) } do.test <- function(senseMat, tissueInd, test=t.test, alternative="greater") { bgInds = setdiff(1:nrow(senseMat), tissueInd) res = apply(senseMat, 2, function(col) { gs = col[tissueInd] bg = col[bgInds] return(test(gs, bg, alternative=alternative)$p.value) }) return(res) } selGeneSet <- "Ovary_NGS_RNASEQATLAS_0.6_3" selSignature <- gmt[[selGeneSet]]$genes tissueInds <- sapply(gmt, function(x) match(x$genes, humanGenes)) senseAmplitudes <- rep(c(seq(0, 1, by=0.25), seq(1.5, 3, by=0.5)), each=50) senseMat <- randomMatrixButOneSignature(rows=humanGenes, signatureGenes=selSignature, amplitudes=senseAmplitudes) senseMatOutlier = addNoise(senseMat, stdv=15, fractionAffected=.01) compare.tests = function(senseMat) { senseBioQC <- wmwTest(senseMat, tissueInds, valType="p.greater", simplify=TRUE)[selGeneSet,] senseTTest <- do.test(senseMat, tissueInds[[selGeneSet]], test=t.test) senseKS <- do.test(senseMat, tissueInds[[selGeneSet]], test=function(x, y, alternative) { ks.test(y, x, alternative=alternative)}) comp = data.frame(BioQC=-log10(senseBioQC), t.test=-log10(senseTTest), ks.test=-log10(senseKS), senseAmplitudes=senseAmplitudes) comp.molten = melt(comp, id="senseAmplitudes") comp.summary = summarySE(comp.molten, measurevar="value", groupvars=c("senseAmplitudes", "variable")) return(comp.summary) } comp.summary = compare.tests(senseMat) comp.summary.outlier = compare.tests(senseMatOutlier) plot.comp = ggplot(comp.summary, aes(x=senseAmplitudes, y=value, color=variable)) + geom_line() + geom_errorbar(aes(ymin=value-se, ymax=value+se), width=.1) + geom_point() + xlab("Mean expression differnce") + ylab("Enrichment score") + ggtitle("without noise") + theme_bw() + scale_color_brewer(palette="Dark2") plot.comp.outlier = ggplot(comp.summary.outlier, aes(x=senseAmplitudes, y=value, color=variable)) + geom_line() + geom_errorbar(aes(ymin=value-se, ymax=value+se), width=.1) + geom_point() + xlab("Mean expression differnce") + ylab("Enrichment score") + ggtitle("with noise") + theme_bw() + scale_color_brewer(palette="Dark2") grid.arrange(plot.comp, plot.comp.outlier, ncol=2) ## ----benchmark, include=FALSE------------------------------------------------- runWMW = function() {return(wmwTest(senseMat, tissueInds, valType="p.greater", simplify=TRUE))} runKS = function() {return(do.test(senseMat, tissueInds[[selGeneSet]], test=function(x, y, alternative) { ks.test(y, x, alternative=alternative)}))} benchmark.res = benchmark(runWMW(), runKS(), columns=c("test", "replications", "elapsed", "relative"), replications=5) ## ----benchmark_res, echo=FALSE------------------------------------------------ benchmark.res ## ----session_info------------------------------------------------------------- sessionInfo()