## ----style-knitr, eval=TRUE, echo=FALSE, results="asis"------------------ BiocStyle::latex() ## ----setup, echo=FALSE--------------------------------------------------- library(knitr) opts_chunk$set(dev="pdf", fig.align="center", cache=FALSE, message=FALSE, out.width=".55\\textwidth", echo=TRUE, results="markup", fig.show="hold") options(width=60) ## ----data---------------------------------------------------------------- require(EDASeq) require(yeastRNASeq) require(leeBamViews) ## ----import-raw---------------------------------------------------------- files <- list.files(file.path(system.file(package = "yeastRNASeq"), "reads"), pattern = "fastq", full.names = TRUE) names(files) <- gsub("\\.fastq.*", "", basename(files)) met <- DataFrame(conditions=c(rep("mut",2), rep("wt",2)), row.names=names(files)) fastq <- FastqFileList(files) elementMetadata(fastq) <- met fastq ## ----import-aligned------------------------------------------------------ files <- list.files(file.path(system.file(package = "leeBamViews"), "bam"), pattern = "bam$", full.names = TRUE) names(files) <- gsub("\\.bam", "", basename(files)) gt <- gsub(".*/", "", files) gt <- gsub("_.*", "", gt) lane <- gsub(".*(.)$", "\\1", gt) geno <- gsub(".$", "", gt) pd <- DataFrame(geno=geno, lane=lane, row.names=paste(geno,lane,sep=".")) bfs <- BamFileList(files) elementMetadata(bfs) <- pd bfs ## ----plot-total, fig.cap="Per-lane number of mapped reads and quality scores", fig.subcap=c("Number of mapped reads", "Mean per-base quality of mapped reads"), out.width='.49\\linewidth'---- colors <- c(rep(rgb(1,0,0,alpha=0.7),2), rep(rgb(0,0,1,alpha=0.7),2), rep(rgb(0,1,0,alpha=0.7),2), rep(rgb(0,1,1,alpha=0.7),2)) barplot(bfs,las=2,col=colors) plotQuality(bfs,col=colors,lty=1) legend("topright",unique(elementMetadata(bfs)[,1]), fill=unique(colors)) ## ----plot-qual, fig.cap="Quality scores and number of mapped reads for lane 'isowt5\\_13e.'", fig.subcap=c("Per-base quality of mapped reads", "Number of mapped reads per-chromosome"), out.width='.49\\linewidth'---- plotQuality(bfs[[1]],cex.axis=.8) barplot(bfs[[1]],las=2) ## ----plot-nt, fig.cap="Per-base nucleotide frequencies of mapped reads for lane 'isowt5\\_13e.'", out.width='.49\\linewidth'---- plotNtFrequency(bfs[[1]]) ## ----load-gene-level----------------------------------------------------- data(geneLevelData) head(geneLevelData) ## ----load-lgc------------------------------------------------------------ data(yeastGC) head(yeastGC) data(yeastLength) head(yeastLength) ## ----filter-------------------------------------------------------------- filter <- apply(geneLevelData,1,function(x) mean(x)>10) table(filter) common <- intersect(names(yeastGC), rownames(geneLevelData[filter,])) length(common) ## ----create-object------------------------------------------------------- feature <- data.frame(gc=yeastGC,length=yeastLength) data <- newSeqExpressionSet(counts=as.matrix(geneLevelData[common,]), featureData=feature[common,], phenoData=data.frame( conditions=c(rep("mut",2),rep("wt",2)), row.names=colnames(geneLevelData))) data ## ----show-data----------------------------------------------------------- head(counts(data)) pData(data) head(fData(data)) ## ----show-offset--------------------------------------------------------- head(offst(data)) ## ----boxplot-genelevel, fig.cap="Between-lane distribution of gene-level counts (log)."---- boxplot(data,col=colors[1:4]) ## ----md-plot, fig.cap="Mean-difference plot of the gene-level counts (log) of lanes 'mut\\_1' and 'wt\\_1.'"---- MDPlot(data,c(1,3)) ## ----plot-mean-var, fig.cap="Mean-variance relationship for the two mutant lanes and all four lanes: the black line corresponds to the Poisson distribution (variance equal to the mean), while the red curve is a lowess fit.", fig.subcap=c("Mutant lanes", "All four lanes"), out.width='.49\\linewidth'---- meanVarPlot(data[,1:2], log=TRUE, ylim=c(0,16)) meanVarPlot(data, log=TRUE, ylim=c(0,16)) ## ----plot-gc, fig.cap="Lowess regression of the gene-level counts (log) on GC-content for each lane, color-coded by experimental condition."---- biasPlot(data, "gc", log=TRUE, ylim=c(1,5)) ## ----boxplot-gc, fig.cap="Boxplots of the log-fold-change between 'mut\\_1' and 'wt\\_1' lanes stratified by GC-content."---- lfc <- log(counts(data)[,3]+0.1) - log(counts(data)[,1]+0.1) biasBoxplot(lfc, fData(data)$gc) ## ----normalization------------------------------------------------------- dataWithin <- withinLaneNormalization(data,"gc", which="full") dataNorm <- betweenLaneNormalization(dataWithin, which="full") ## ----plot-gc-norm, fig.cap="Full-quantile within- and between-lane normalization. (a) Lowess regression of normalized gene-level counts (log) on GC-content for each lane. (b) Between-lane distribution of normalized gene-level counts (log).", fig.subcap=c("GC-content", "Count distribution"), out.width='.49\\linewidth'---- biasPlot(dataNorm, "gc", log=TRUE, ylim=c(1,5)) boxplot(dataNorm, col=colors) ## ----norm-offset--------------------------------------------------------- dataOffset <- withinLaneNormalization(data,"gc", which="full",offset=TRUE) dataOffset <- betweenLaneNormalization(dataOffset, which="full",offset=TRUE) ## ----edger--------------------------------------------------------------- library(edgeR) design <- model.matrix(~conditions, data=pData(dataOffset)) disp <- estimateGLMCommonDisp(counts(dataOffset), design, offset=-offst(dataOffset)) fit <- glmFit(counts(dataOffset), design, disp, offset=-offst(dataOffset)) lrt <- glmLRT(fit, coef=2) topTags(lrt) ## ----deseq--------------------------------------------------------------- library(DESeq) counts <- as(dataNorm, "CountDataSet") sizeFactors(counts) <- rep(1,4) counts <- estimateDispersions(counts) res <- nbinomTest(counts, "wt", "mut") head(res) ## ----unrounded----------------------------------------------------------- dataNorm <- betweenLaneNormalization(data, round=FALSE, offset=TRUE) norm1 <- normCounts(dataNorm) norm2 <- exp(log(counts(dataNorm) + 0.1 ) + offst(dataNorm)) - 0.1 head(norm1 - norm2) ## ----rounded------------------------------------------------------------- head(round(normCounts(dataNorm)) - round(counts(dataNorm) * exp(offst(dataNorm)))) ## ----getLengthAndGC------------------------------------------------------ getGeneLengthAndGCContent(id=c("ENSG00000012048", "ENSG00000139618"), org="hsa") ## ----getLengthAndGC-full, eval=FALSE------------------------------------- ## fData(data) <- getGeneLengthAndGCContent(featureNames(data), ## org="sacCer3", mode="org.db") ## ----sessionInfo, results="asis"----------------------------------------- toLatex(sessionInfo())