## ---- echo=FALSE-------------------------------------------------------------- library(knitr) opts_chunk$set(cache=FALSE, error=FALSE) ## ----message=FALSE------------------------------------------------------------ library(alpineData) dir <- system.file("extdata",package="alpineData") metadata <- read.csv(file.path(dir,"metadata.csv"), stringsAsFactors=FALSE) metadata[,c("Title","Performer","Date","Population")] ## ----message=FALSE------------------------------------------------------------ library(GenomicAlignments) ERR188297() ## ----message=FALSE------------------------------------------------------------ library(rtracklayer) dir <- tempdir() for (sample.name in metadata$Title) { # the reads are accessed with functions named # after the sample name. the following line calls # the function with the sample name and saves # the reads to `gap` gap <- match.fun(sample.name)() file.name <- file.path(dir,paste0(sample.name,".bam")) export(gap, con=file.name) } bam.files <- file.path(dir, paste0(metadata$Title, ".bam")) names(bam.files) <- metadata$Title stopifnot(all(file.exists(bam.files))) ## ---- eval=FALSE-------------------------------------------------------------- # library(ensembldb) # gtf.file <- "Homo_sapiens.GRCh38.84.gtf" # txdb <- EnsDb(gtf.file) # already an EnsDb # txdf <- transcripts(txdb, return.type="DataFrame") # tab <- table(txdf$gene_id) # one.iso.genes <- names(tab)[tab == 1] # # pre-selected genes based on medium to high counts # # calculated using Rsubread::featureCounts # selected.genes <- scan("selected.genes.txt", what="char") # one.iso.txs <- txdf$tx_id[txdf$gene_id %in% # intersect(one.iso.genes, selected.genes)] # ebt0 <- exonsBy(txdb, by="tx") # ebt.fit <- ebt0[one.iso.txs] ## ----message=FALSE------------------------------------------------------------ library(GenomicRanges) ## ----------------------------------------------------------------------------- library(alpine) data(preprocessedData) # filter small genes and long genes min.bp <- 600 max.bp <- 7000 gene.lengths <- sum(width(ebt.fit)) summary(gene.lengths) ebt.fit <- ebt.fit[gene.lengths > min.bp & gene.lengths < max.bp] length(ebt.fit) set.seed(1) # better to use ~100 genes ebt.fit <- ebt.fit[sample(length(ebt.fit),10)] ## ----------------------------------------------------------------------------- w <- getFragmentWidths(bam.files[1], ebt.fit[[1]]) c(summary(w), Number=length(w)) quantile(w, c(.025, .975)) ## ----------------------------------------------------------------------------- getReadLength(bam.files) ## ----message=FALSE------------------------------------------------------------ library(alpine) library(BSgenome.Hsapiens.NCBI.GRCh38) readlength <- 75 minsize <- 125 # better 80 for this data maxsize <- 175 # better 350 for this data gene.names <- names(ebt.fit) names(gene.names) <- gene.names ## ----buildFragtype------------------------------------------------------------ system.time({ fragtypes <- lapply(gene.names, function(gene.name) { buildFragtypes(exons=ebt.fit[[gene.name]], genome=Hsapiens, readlength=readlength, minsize=minsize, maxsize=maxsize, gc.str=FALSE) }) }) print(object.size(fragtypes), units="auto") ## ----------------------------------------------------------------------------- head(fragtypes[[1]], 3) ## ----------------------------------------------------------------------------- models <- list( "GC" = list( formula = "count ~ ns(gc,knots=gc.knots,Boundary.knots=gc.bk) + ns(relpos,knots=relpos.knots,Boundary.knots=relpos.bk) + gene", offset=c("fraglen") ), "all" = list( formula = "count ~ ns(gc,knots=gc.knots,Boundary.knots=gc.bk) + ns(relpos,knots=relpos.knots,Boundary.knots=relpos.bk) + gene", offset=c("fraglen","vlmm") ) ) ## ----fitBiasModels------------------------------------------------------------ system.time({ fitpar <- lapply(bam.files, function(bf) { fitBiasModels(genes=ebt.fit, bam.file=bf, fragtypes=fragtypes, genome=Hsapiens, models=models, readlength=readlength, minsize=minsize, maxsize=maxsize) }) }) # this object saved was 'fitpar.small' for examples in alpine man pages # fitpar.small <- fitpar ## ----------------------------------------------------------------------------- library(RColorBrewer) palette(brewer.pal(8,"Dark2")) ## ----fraglen------------------------------------------------------------------ perf <- as.integer(factor(metadata$Performer)) plotFragLen(fitpar, col=perf) ## ----gccurve------------------------------------------------------------------ plotGC(fitpar, model="all", col=perf) ## ----relpos------------------------------------------------------------------- plotRelPos(fitpar, model="all", col=perf) ## ----vlmm--------------------------------------------------------------------- plotOrder0(fitpar[["ERR188297"]][["vlmm.fivep"]][["order0"]]) plotOrder0(fitpar[["ERR188297"]][["vlmm.threep"]][["order0"]]) ## ----------------------------------------------------------------------------- print(head(fitpar[["ERR188297"]][["summary"]][["all"]]), row.names=FALSE) ## ---- eval=FALSE-------------------------------------------------------------- # one.iso.genes <- intersect(names(tab)[tab == 1], selected.genes) # two.iso.genes <- intersect(names(tab)[tab == 2], selected.genes) # three.iso.genes <- intersect(names(tab)[tab == 3], selected.genes) # set.seed(1) # genes.theta <- c(sample(one.iso.genes, 2), # sample(two.iso.genes, 2), # sample(three.iso.genes, 2)) # txdf.theta <- txdf[txdf$gene_id %in% genes.theta,] # ebt.theta <- ebt0[txdf.theta$tx_id] ## ----------------------------------------------------------------------------- model.names <- c("null","fraglen.vlmm","GC") ## ----estimateAbundance-------------------------------------------------------- system.time({ res <- lapply(genes.theta, function(gene.name) { txs <- txdf.theta$tx_id[txdf.theta$gene_id == gene.name] estimateAbundance(transcripts=ebt.theta[txs], bam.files=bam.files, fitpar=fitpar, genome=Hsapiens, model.names=model.names) }) }) ## ----------------------------------------------------------------------------- res[[1]][["ERR188297"]][["GC"]] res[[6]][["ERR188297"]][["GC"]] ## ----------------------------------------------------------------------------- mat <- extractAlpine(res, model="GC") mat ## ----------------------------------------------------------------------------- se <- extractAlpine(res, model="GC", transcripts=ebt.theta) se ## ---- eval=FALSE-------------------------------------------------------------- # norm.mat <- normalizeDESeq(mat, cutoff=0.1) ## ----------------------------------------------------------------------------- data(preprocessedData) prob.mat <- plotGC(fitpar, "all", return.type=2) head(prob.mat) ## ----------------------------------------------------------------------------- model.names <- c("fraglen","fraglen.vlmm","GC","all") ## ----------------------------------------------------------------------------- fitpar[[1]][["model.params"]][c("minsize","maxsize")] ## ----predictCoverage---------------------------------------------------------- system.time({ pred.cov <- predictCoverage(gene=ebt.fit[["ENST00000245479"]], bam.files=bam.files["ERR188204"], fitpar=fitpar, genome=Hsapiens, model.names=model.names) }) ## ----------------------------------------------------------------------------- palette(brewer.pal(9, "Set1")) frag.cov <- pred.cov[["ERR188204"]][["frag.cov"]] plot(frag.cov, type="l", lwd=3, ylim=c(0,max(frag.cov)*1.5)) for (i in seq_along(model.names)) { m <- model.names[i] pred <- pred.cov[["ERR188204"]][["pred.cov"]][[m]] lines(pred, col=i, lwd=3) } legend("topright", legend=c("observed",model.names), col=c("black",seq_along(model.names)), lwd=3) ## ----------------------------------------------------------------------------- sessionInfo()