## ----echo=FALSE,results="hide"--------------------------------------------- date() ## ----echo=FALSE,results="hide"--------------------------------------------- scoresCis = function(...){NULL} # does commented unevald code get checked? suppressMessages({ # include some warnings on symbol replacements if (!"BiocManager" %in% rownames(installed.packages())) install.packages("BiocManager") if (!("GGdata" %in% installed.packages()[,1])) { install("GGdata") } if (!("SNPlocs.Hsapiens.dbSNP144.GRCh37" %in% installed.packages()[,1])) { install("SNPlocs.Hsapiens.dbSNP144.GRCh37") } library(knitcitations) library(bibtex) allbib = read.bibtex("allbib.bib") library(GenomeInfoDb) library(S4Vectors) library(GGtools) library(GGdata) library(yri1kgv) library(snpStats) library(scatterplot3d) library(lumi) library(parallel) library(foreach) library(doParallel) library(biglm) library(lumiHumanAll.db) library(rmeta) library(SNPlocs.Hsapiens.dbSNP144.GRCh37) library(eQTL) }) ## ----dolo,echo=FALSE------------------------------------------------------- cc = new("CisConfig") chrnames(cc) = "21" radius(cc) = 25000L lkp = try(library(parallel)) if (!inherits(lkp, "try-error")) { nc = 2 # max(c(1,min(c(10, detectCores()-1)))) # attempt to trim for workflow builder options(mc.cores=nc) geneApply(cc) = mclapply if (.Platform$OS.type == "windows") geneApply(cc) = lapply } estimates(cc) = FALSE set.seed(1234) f1 <- All.cis( cc ) # devel: cisScores ## ----lkex,eval=FALSE------------------------------------------------------- # cc = new("CisConfig") # take a default configuration # chrnames(cc) = "21" # confine to chr21 # estimates(cc) = FALSE # no point estimates neede # f1 <- All.cis( cc ) # compute the tests; can be slow without attendance # # to parallelization ## ----lookf1,echo=TRUE------------------------------------------------------ length(f1) f1[1:3] metadata(f1) ## ----demoy,fig=TRUE,fig.width=7,fig.height=4,echo=FALSE,results="hide"----- suppressMessages({ library(yri1kgv) if (!exists("c20")) c20 = getSS("yri1kgv", "chr20") par(mfrow=c(1,2)) plot_EvG(probeId("o67h4JQSuEa02CJJIQ"), rsid("rs2259928"), c20, main="observed expr.") if (!exists("c20f")) c20f = clipPCs(c20, 1:10) plot_EvG(probeId("o67h4JQSuEa02CJJIQ"), rsid("rs2259928"), c20f, main="10 expr. PC removed") }) ## ----shoit,eval=FALSE------------------------------------------------------ # plot_EvG(probeId("o67h4JQSuEa02CJJIQ"), rsid("rs2259928"), c20f, # main="10 expr. PC removed") ## ----bag,fig=TRUE,fig.width=4,fig.height=4,echo=FALSE---------------------- library(snpStats) library(scatterplot3d) tmp = as.raw(1:253) yy = g2post(tmp) EB = yy %*% c(0,1,2) scatterplot3d(yy[,1], yy[,3], EB, xlab="Pr(A/A)", ylab="Pr(B/B)", zlab="mean num. B") ## ----bag2------------------------------------------------------------------ library(GGtools) library(yri1kgv) library(lumiHumanAll.db) if (!exists("y22")) y22 = getSS("yri1kgv", "chr22") y22 dim(exprs(y22)) fn = featureNames(y22)[1:5] ## ----getseq---------------------------------------------------------------- library(lumi) id2seq(fn) # get the 50mer for each probe # and some annotation ## ----getann---------------------------------------------------------------- select( lumiHumanAll.db, keys=fn, keytype="PROBEID", columns=c("SYMBOL", "CHR", "ENTREZID")) ## ----getgen---------------------------------------------------------------- gt22 <- smList(y22)[[1]] # access to genotypes as( gt22[1:5,1:5], "character" ) cs22 = col.summary(gt22) # some information on genotypes cs22[1:10,] ## ----showscript,eval=FALSE------------------------------------------------- # library(parallel) # newcl = makePSOCKcluster(c("master", paste0("node00", 1:3))) # library(foreach) # library(doParallel) # registerDoParallel(cores=8) # may want to keep at 5 # # library(GGtools) # ceuDemoRecov = try(ciseqByCluster( newcl, # chromsToRun=19:22, finaltag="partceu100k", # outprefix="ceurun", # ncoresPerNode=8, targetfolder="/freshdata/CEU_DEMO" )) # save(ceuDemoRecov, file="ceuDemoRecov.rda") # stopCluster(newcl) # stopImplicitCluster() # sessionInfo() ## ----lkqq,eval=FALSE------------------------------------------------------- # binnedQQ(partceu100k_dt, ylim=c(0,30), xlim=c(-2,15), end45=12) ## ----co,echo=FALSE--------------------------------------------------------- update_fdr_filt = function (tab, filt = function(x) x, by = c("pairs", "snps", "probes")[1]) { require(GGtools, quietly = TRUE) tab = filt(tab) psinds = grep("permScore", colnames(tab), value = TRUE) nr = nrow(tab) pscores = vector("numeric", nr * length(psinds)) for (np in 1:length(psinds)) pscores[(((np - 1) * nr) + 1):(np * nr)] = tab[[psinds[np]]] if (by == "pairs") { newfd = pifdr(tab$score, pscores) } else { if (by == "snps") byvar = "snp" else if (by == "probes") byvar = "probeid" base = tab[, max(score), by = byvar] maxbysnp = base$V1 ol = list() pnames = grep("permScore", names(tab)) for (i in 1:length(pnames)) { tab$score = tab[, pnames[i], with = FALSE] ol[[i]] = tab[, max(score), by = byvar]$V1 } newfd = pifdr(maxbysnp, as.numeric(unlist(ol))) tab = base } tab$fdr = newfd tab } # # defining here so that release version can be used # filtgen.maf.dist = function (maf.dist, validate.tab = function(tab) all(c("mindist", "MAF") %in% colnames(tab))) { stopifnot(is.atomic(maf.dist)) stopifnot(length(maf.dist) == 2) maf = maf.dist[1] dist = maf.dist[2] function(tab) { stopifnot(isTRUE(validate.tab(tab))) tab[tab$mindist <= dist & tab$MAF >= maf, ] } } eqsens_dt = function (dtab, filtgen = filtgen.maf.dist, by = c("pairs", "snps", "probes")[1], targfdrs = c(0.05, 0.01, 0.005), parmslist = list(mafs = c(0.025, 0.05, 0.075, 0.1, 0.125), dists = c(1000, 5000, 10000, 25000, 50000, 1e+05))) { parmset = data.matrix(do.call(expand.grid, parmslist)) ntune = nrow(parmset) ans = foreach(curp = 1:ntune) %dopar% { tmp = update_fdr_filt(tab = dtab, filt = filtgen(parmset[curp, ]), by = by) sapply(targfdrs, function(x) sum(tmp$fdr <= x)) } hold = t(sapply(ans, force)) colnames(hold) = paste0("at_", targfdrs) cbind(parmset, hold) } filtgen.maf.dist = function (maf.dist, validate.tab = function(tab) all(c("mindist", "MAF") %in% colnames(tab))) { stopifnot(is.atomic(maf.dist)) stopifnot(length(maf.dist) == 2) maf = maf.dist[1] dist = maf.dist[2] function(tab) { stopifnot(isTRUE(validate.tab(tab))) tab[tab$mindist <= dist & tab$MAF >= maf, ] } } plotsens = function (eqsout, ylab = "count of eQTL at given FDR", title = "cis radius in bp") { require(reshape2) require(ggplot2) mdf = melt(data.frame(eqsout), id.vars = c("mafs", "dists")) vind = which(names(mdf) == "variable") names(mdf)[vind] = "FDR" mdf[, vind] = gsub("at_", "", mdf[, vind]) ggplot(data = mdf) + geom_point(aes(x = mafs, y = value, colour = FDR)) + facet_grid(~dists) + theme(axis.text.x = element_text(angle = 90)) + ylab(ylab) + labs(title = title) } ## ----dosens,fig=TRUE------------------------------------------------------- data(partceu100k_dt) library(foreach) # basic function includes a %dopar% library(doParallel) library(parallel) # dcdown = max(c(detectCores()-1,1)) ## use with lots of RAM dcdown = 1 registerDoSEQ() if (.Platform$OS.type != "windows") { #cl = makeForkCluster(dcdown) registerDoParallel(cores=dcdown) # nesting? } ## ----doeq,eval=FALSE------------------------------------------------------- # eq1 = eqsens_dt(partceu100k_dt) ## ----nnn,echo=FALSE-------------------------------------------------------- data(sensSave) eq1 = sensSave ## ----lkar------------------------------------------------------------------ args(eqsens_dt) ## ----coded,eval=FALSE------------------------------------------------------ # library(data.table) # data("partceu100k_dt.rda") # scoresCis("CPNE1", partceu100k_dt) ## ----disc------------------------------------------------------------------ distcat = cut(partceu100k_dt$mindist,c(-1, 1, 1000, 5000, 10000, 50000, 100001)) fdrcat = cut(partceu100k_dt$fdr,c(-.01,.005, .05, .1, .2, 1.01)) fdrcat = relevel(fdrcat, "(0.2,1.01]") mafcat = cut(partceu100k_dt$MAF,c(0,.05, .1, .2, .3, .51)) approm = 1*partceu100k_dt$chromcat878 %in% c("1_Active_Promoter", "3_Poised_Promoter") ## ----fit------------------------------------------------------------------- partceu100k_dt = cbind(partceu100k_dt, distcat, fdrcat, mafcat, approm) set.seed(1234) train = sample(1:nrow(partceu100k_dt), size=floor(nrow(partceu100k_dt)/2), replace=FALSE) library(biglm) b1 = bigglm(isgwashit~distcat+fdrcat+mafcat+approm, fam=binomial(), data=partceu100k_dt[train,], maxit=30) ## ----cali------------------------------------------------------------------ pp = predict(b1, newdata=partceu100k_dt[-train,], type="response") summary(pp) cpp = cut(pp, c(0,.025, .05, .12, .21)) table(cpp) sapply(split(partceu100k_dt$isgwashit[-train], cpp), mean) ## ----demomodco,fig=TRUE,fig.width=7,fig.height=4--------------------------- tmat = matrix(rownames(summary(b1)$mat),nc=1) est = summary(b1)$mat[,1] library(rmeta) forestplot(tmat, est, est-.01, est+.01, xlog=TRUE, boxsize=.35, graphwidth=unit(3, "inches"), xticks=exp(seq(-4,2,2))) ## ----cleanup, echo=FALSE--------------------------------------------------- rm(list=ls()) gc() ## ----results='asis',echo=FALSE--------------------------------------------- bibliography() #style="markdown") ## ----sess------------------------------------------------------------------ sessionInfo()