## ----setup, echo=FALSE-------------------------------------------------------- options(width=80) ## ----library_install, message=FALSE, cache=FALSE, eval=FALSE------------------ # install.packages("BiocManager") # BiocManager::install("GenomicScores") ## ----library_upload, message=FALSE, warning=FALSE, cache=FALSE---------------- library(GenomicScores) ## ---- message=FALSE, warning=FALSE, cache=FALSE------------------------------- library(phastCons100way.UCSC.hg19) phast <- phastCons100way.UCSC.hg19 class(phast) ## ----------------------------------------------------------------------------- gscores(phast, GRanges(seqnames="chr22", IRanges(start=50967020:50967025, width=1))) ## ----------------------------------------------------------------------------- phast ## ----------------------------------------------------------------------------- citation(phast) ## ----------------------------------------------------------------------------- provider(phast) providerVersion(phast) organism(phast) seqlevelsStyle(phast) ## ---- echo=FALSE-------------------------------------------------------------- avgs <- readRDS(system.file("extdata", "avgs.rds", package="GenomicScores")) ## ----retrieve2, message=FALSE, cache=FALSE, eval=FALSE------------------------ # availableGScores() ## ---- echo=FALSE-------------------------------------------------------------- avgs ## ----retrieve3, message=FALSE, cache=FALSE, eval=FALSE------------------------ # phast <- getGScores("phastCons100way.UCSC.hg19") ## ----retrieve4, message=FALSE, cache=FALSE------------------------------------ gscores(phast, GRanges(seqnames="chr22", IRanges(start=50967020:50967025, width=1))) ## ----eval=FALSE--------------------------------------------------------------- # makeGScoresPackage(phast, maintainer="Me ", author="Me", version="1.0.0") ## ----echo=FALSE--------------------------------------------------------------- cat("Creating package in ./phastCons100way.UCSC.hg19\n") ## ---- echo=FALSE-------------------------------------------------------------- obj <- readRDS(system.file("extdata", "mcap.v1.0.hg19.chr22.rds", package="GenomicScores")) mdobj <- metadata(obj) mcap <- GScores(provider=mdobj$provider, provider_version=mdobj$provider_version, download_url=mdobj$download_url, download_date=mdobj$download_date, reference_genome=mdobj$reference_genome, data_pkgname=mdobj$data_pkgname, data_dirpath="../inst/extdata", data_serialized_objnames=c(mcap.v1.0.hg19.chr22.rds="mcap.v1.0.hg19.chr22.rds"), data_tag="MCAP") gscopops <- get(mdobj$data_pkgname, envir=mcap@.data_cache) gscopops[["default"]] <- RleList(compress=FALSE) gscopops[["default"]][[mdobj$seqname]] <- obj assign(mdobj$data_pkgname, gscopops, envir=mcap@.data_cache) ## ---- eval=FALSE-------------------------------------------------------------- # mcap <- getGScores("mcap.v1.0.hg19") ## ----------------------------------------------------------------------------- mcap citation(mcap) gr <- GRanges(seqnames="chr22", IRanges(50967020:50967025, width=1)) gscores(mcap, gr) ## ---- message=FALSE----------------------------------------------------------- library(BSgenome.Hsapiens.UCSC.hg19) refAlleles <- as.character(getSeq(Hsapiens, gr)) altAlleles <- DNA_BASES[(match(refAlleles, DNA_BASES)) %% 4 + 1] cbind(REF=refAlleles, ALT=altAlleles) gscores(mcap, gr, ref=refAlleles, alt=altAlleles) ## ----------------------------------------------------------------------------- gr1 <- GRanges(seqnames="chr22", IRanges(start=50967020:50967025, width=1)) gr1sco <- gscores(phast, gr1) gr1sco mean(gr1sco$default) gr2 <- GRanges("chr22:50967020-50967025") gscores(phast, gr2) ## ----------------------------------------------------------------------------- gscores(phast, gr2, summaryFun=max) gscores(phast, gr2, summaryFun=min) gscores(phast, gr2, summaryFun=median) ## ----------------------------------------------------------------------------- phastqfun <- qfun(phast) phastqfun phastdqfun <- dqfun(phast) phastdqfun ## ----------------------------------------------------------------------------- gr1sco <- gscores(phast, gr1, quantized=TRUE) gr1sco ## ----------------------------------------------------------------------------- phastdqfun(gr1sco$default) ## ----------------------------------------------------------------------------- populations(phast) ## ----------------------------------------------------------------------------- defaultPopulation(phast) ## ----------------------------------------------------------------------------- gscores(phast, gr1, pop="DP2") qfun(phast, pop="DP2") dqfun(phast, pop="DP2") ## ----------------------------------------------------------------------------- defaultPopulation(phast) <- "DP2" phast gscores(phast, gr1) qfun(phast) dqfun(phast) ## ---- message=FALSE----------------------------------------------------------- library(MafDb.1Kgenomes.phase1.hs37d5) mafdb <- MafDb.1Kgenomes.phase1.hs37d5 mafdb populations(mafdb) ## ---- message=FALSE----------------------------------------------------------- library(gwascat) data(ebicat37) eyersids <- unique(subsetByTraits(ebicat37, tr="Eye color")$SNPS) eyersids ## ---- message=FALSE----------------------------------------------------------- library(SNPlocs.Hsapiens.dbSNP144.GRCh37) rng <- snpsById(SNPlocs.Hsapiens.dbSNP144.GRCh37, ids=eyersids) rng ## ----------------------------------------------------------------------------- eyecolormafs <- gscores(mafdb, rng, pop=c("AF", "EUR_AF", "AFR_AF")) eyecolormafs ## ----------------------------------------------------------------------------- gscores(mafdb, eyersids, pop=c("AF", "EUR_AF", "AFR_AF")) ## ----eyecolormafs, fig.cap = "Eye color MAFs. Minor allele frequencies (MAFs) of variants associated with eye color for global, european and african populations of the Phase 1 data from the 1000 Genomes Project.", fig.height=5, fig.wide = TRUE, echo=TRUE---- par(mar=c(3, 5, 1, 1)) mafpops <- c("AF", "EUR_AF", "AFR_AF") bp <- barplot(t(as.matrix(mcols(eyecolormafs)[, mafpops])), beside=TRUE, col=c("darkblue", "darkgreen", "darkorange"), ylim=c(0, 0.55), las=1, ylab="Minor allele frequency") axis(1, bp[2, ], 1:length(eyecolormafs)) mtext("Variant", side=1, line=2) legend("topright", mafpops, fill=c("darkblue", "darkgreen", "darkorange")) ## ---- message=FALSE----------------------------------------------------------- library(VariantAnnotation) library(TxDb.Hsapiens.UCSC.hg19.knownGene) ## ----------------------------------------------------------------------------- fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation") vcf <- readVcf(fl, "hg19") seqlevelsStyle(vcf) txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene seqlevelsStyle(txdb) ## ----------------------------------------------------------------------------- seqlevelsStyle(vcf) <- seqlevelsStyle(txdb) ## ---- message=FALSE----------------------------------------------------------- loc <- locateVariants(vcf, txdb, AllVariants()) loc[1:3] table(loc$LOCATION) ## ----------------------------------------------------------------------------- loc$PHASTCONS <- score(phast, loc) loc[1:3] ## ----plot1, fig.cap = "Distribution of phastCons conservation scores in variants across different annotated regions. Diamonds indicate mean values.", echo = FALSE, fig.height=5, fig.wide = TRUE, echo=TRUE---- x <- split(loc$PHASTCONS, loc$LOCATION) mask <- elementNROWS(x) > 0 boxplot(x[mask], ylab="phastCons score", las=1, cex.axis=1.2, cex.lab=1.5, col="gray") points(1:length(x[mask])+0.25, sapply(x[mask], mean, na.rm=TRUE), pch=23, bg="black") ## ----------------------------------------------------------------------------- maskSNVs <- isSNV(vcf)[loc$QUERYID] loc$MCAP <- rep(NA_real_, length(loc)) loc$MCAP[maskSNVs] <- score(mcap, loc[maskSNVs], ref=ref(vcf)[loc$QUERYID[maskSNVs]], alt=alt(vcf)[loc$QUERYID[maskSNVs]]) ## ---- echo=FALSE-------------------------------------------------------------- obj <- readRDS(system.file("extdata", "cadd.v1.3.hg19.chr22.rds", package="GenomicScores")) mdobj <- metadata(obj) cadd <- GScores(provider=mdobj$provider, provider_version=mdobj$provider_version, download_url=mdobj$download_url, download_date=mdobj$download_date, reference_genome=mdobj$reference_genome, data_pkgname=mdobj$data_pkgname, data_dirpath="../inst/extdata", data_serialized_objnames=c(cadd.v1.3.hg19.chr22.rds="cadd.v1.3.hg19.chr22.rds"), data_tag="CADD") gscopops <- get(mdobj$data_pkgname, envir=cadd@.data_cache) gscopops[["default"]] <- RleList(compress=FALSE) gscopops[["default"]][[mdobj$seqname]] <- obj assign(mdobj$data_pkgname, gscopops, envir=cadd@.data_cache) ## ----------------------------------------------------------------------------- maskSNVs <- isSNV(vcf)[loc$QUERYID] loc$CADD <- rep(NA_real_, length(loc)) loc$CADD[maskSNVs] <- score(cadd, loc[maskSNVs], ref=ref(vcf)[loc$QUERYID[maskSNVs]], alt=alt(vcf)[loc$QUERYID[maskSNVs]]) ## ----mcapvscadd, fig.cap = "Comparison of M-CAP and CADD scores. Values on the x- and y-axis are jittered to facilitate visualization.", echo = FALSE, fig.height=5, fig.width=7, dpi=100, echo=TRUE---- library(RColorBrewer) par(mar=c(4, 5, 1, 1)) hmcol <- colorRampPalette(brewer.pal(nlevels(loc$LOCATION), "Set1"))(nlevels(loc$LOCATION)) plot(jitter(loc$MCAP, factor=2), jitter(loc$CADD, factor=2), pch=19, col=hmcol, xlab="M-CAP scores", ylab="CADD scores", las=1, cex.axis=1.2, cex.lab=1.5, panel.first=grid()) legend("bottomright", levels(loc$LOCATION), pch=19, col=hmcol, inset=0.01) ## ----------------------------------------------------------------------------- seqlevelsStyle(loc) <- seqlevelsStyle(mafdb)[1] seqinfo(loc, new2old=match(seqlevels(mafdb), seqlevels(loc))) <- seqinfo(mafdb) maskSNVs <- isSNV(vcf)[loc$QUERYID] loc$MAF[maskSNVs] <- gscores(mafdb, loc[maskSNVs])$AF loc$MAF[!maskSNVs] <- gscores(mafdb, loc[!maskSNVs], type="nonsnrs")$AF loc[1:3] ## ----------------------------------------------------------------------------- afs <- DataFrame(matrix(NA, nrow=length(loc), ncol=2, dimnames=list(NULL, c("AF_REF", "AF_ALT")))) afs[maskSNVs, ] <- score(mafdb, loc[maskSNVs], ref=ref(vcf)[loc$QUERYID[maskSNVs]], alt=alt(vcf)[loc$QUERYID[maskSNVs]]) afs[!maskSNVs, ] <- score(mafdb, loc[!maskSNVs], ref=ref(vcf)[loc$QUERYID[!maskSNVs]], alt=alt(vcf)[loc$QUERYID[!maskSNVs]], type="nonsnrs") mcols(loc) <- cbind(mcols(loc), afs) loc[1:3] ## ----showpositions, message=FALSE, cache=FALSE-------------------------------- origpcscoCDS <- readRDS(system.file("extdata", "origphastCons100wayhg19CDS.rds", package="GenomicScores")) origpcscoCDS length(unique(origpcscoCDS$score)) ## ----------------------------------------------------------------------------- numDecimals <- function(x) { spl <- strsplit(as.character(x+1), "\\.") spl <- sapply(spl, "[", 2) spl[is.na(spl)] <- "" nchar(spl) } nd1 <- numDecimals(origpcscoCDS$score) table(nd1) ## ----showpositions2, message=FALSE, cache=FALSE------------------------------- origpcsco3UTRs <- readRDS(system.file("extdata", "origphastCons100wayhg193UTR.rds", package="GenomicScores")) origpcsco3UTRs length(table(origpcsco3UTRs$score)) nd2 <- numDecimals(origpcsco3UTRs$score) table(nd2) ## ----------------------------------------------------------------------------- defaultPopulation(phast) <- "default" phast ## ----------------------------------------------------------------------------- pkgpcscoCDS <- score(phast, origpcscoCDS) pkgpcsco3UTRs <- score(phast, origpcsco3UTRs) ## ----plot2, fig.height=9, fig.width=8, dpi=100, fig.cap = "Original and lossy-compressed phastCons scores. Top panels (a, b): comparison of the distribution of values. Bottom panels (c, d): comparison of the cumulative distribution", echo = FALSE---- labelPanel <- function(lab, font=2, cex=2, offsetx=0.05, offsety=0.05) { par(xpd=TRUE) w <- par("usr")[2] - par("usr")[1] h <- par("usr")[4] - par("usr")[3] text(par("usr")[1]-w*offsetx, par("usr")[4]+h*offsety, lab, font=font, cex=cex) par(xpd=FALSE) } par(mfrow=c(2, 2), mar=c(4, 5, 2, 1)) plot(origpcscoCDS$score, jitter(pkgpcscoCDS), pch=19, cex=1, cex.lab=1.2, xaxt="n", yaxt="n", xlab="Original phastCons scores (CDS)", ylab="Compressed phastCons scores (CDS)") axis(1, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1) axis(2, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1) abline(h=seq(0, 1, by=0.1), v=seq(0, 1, by=0.1), lty=3, col="gray") abline(0, 1) labelPanel(letters[1]) plot(origpcsco3UTRs$score, jitter(pkgpcsco3UTRs), pch=19, cex=1, cex.lab=1.2, xaxt="n", yaxt="n", xlab="Original phastCons scores (3' UTR)", ylab="Compressed phastCons scores (3' UTR)") axis(1, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1) axis(2, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1) abline(h=seq(0, 1, by=0.1), v=seq(0, 1, by=0.1), lty=3, col="gray") abline(0, 1) labelPanel(letters[2]) ForigCDS <- ecdf(origpcscoCDS$score) FpkgCDS <- ecdf(pkgpcscoCDS) plot(sort(origpcscoCDS$score), ForigCDS(sort(origpcscoCDS$score)), xaxt="n", yaxt="n", cex.lab=1.2, pch=".", cex=4, xlab="phastCons scores (CDS)", ylab="F(x)", ylim=c(0, 1)) axis(1, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1) axis(2, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1) abline(h=seq(0, 1, by=0.1), v=seq(0, 1, by=0.1), lty=3, col="gray") points(sort(pkgpcscoCDS), FpkgCDS(sort(pkgpcscoCDS)), pch=19, cex=1) legend("topleft", c("Original score", "Rounded score"), pch=c(46, 19), pt.cex=c(4, 1), inset=0.01, bg="white") labelPanel(letters[3]) Forig3UTRs <- ecdf(origpcsco3UTRs$score) Fpkg3UTRs <- ecdf(pkgpcsco3UTRs) plot(sort(origpcsco3UTRs$score), Forig3UTRs(sort(origpcsco3UTRs$score)), xaxt="n", yaxt="n", cex.lab=1.2, pch=".", cex=4, xlab="phastCons scores (3'UTR)", ylab="F(x)", ylim=c(0, 1)) axis(1, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1) axis(2, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1) abline(h=seq(0, 1, by=0.1), v=seq(0, 1, by=0.1), lty=3, col="gray") points(sort(pkgpcsco3UTRs), Fpkg3UTRs(sort(pkgpcsco3UTRs)), pch=19, cex=1) legend("topleft", c("Original score", "Rounded score"), pch=c(46, 19), pt.cex=c(4, 1), inset=0.01, bg="white") labelPanel(letters[4]) ## ----session_info, cache=FALSE------------------------------------------------ sessionInfo()