## ----style, echo=FALSE, results='hide', message=FALSE------------------------- library(BiocStyle) library(knitr) opts_chunk$set(error=FALSE, message=FALSE, warning=FALSE, cache=TRUE) ## ----------------------------------------------------------------------------- library(chipseqDBData) h3k27me3data <- H3K27me3Data() h3k27me3data ## ----------------------------------------------------------------------------- library(Rsamtools) diagnostics <- list() for (bam in h3k27me3data$Path) { total <- countBam(bam)$records mapped <- countBam(bam, param=ScanBamParam( flag=scanBamFlag(isUnmapped=FALSE)))$records marked <- countBam(bam, param=ScanBamParam( flag=scanBamFlag(isUnmapped=FALSE, isDuplicate=TRUE)))$records diagnostics[[basename(bam)]] <- c(Total=total, Mapped=mapped, Marked=marked) } diag.stats <- data.frame(do.call(rbind, diagnostics)) diag.stats$Prop.mapped <- diag.stats$Mapped/diag.stats$Total*100 diag.stats$Prop.marked <- diag.stats$Marked/diag.stats$Mapped*100 diag.stats ## ----------------------------------------------------------------------------- library(BiocFileCache) bfc <- BiocFileCache("local", ask=FALSE) black.path <- bfcrpath(bfc, file.path("http://hgdownload.cse.ucsc.edu", "goldenPath/mm10/bigZips/chromOut.tar.gz")) tmpdir <- tempfile() dir.create(tmpdir) untar(black.path, exdir=tmpdir) # Iterate through all chromosomes. collected <- list() for (x in list.files(tmpdir, full=TRUE)) { f <- list.files(x, full=TRUE, pattern=".fa.out") to.get <- vector("list", 15) to.get[[5]] <- "character" to.get[6:7] <- "integer" collected[[length(collected)+1]] <- read.table(f, skip=3, stringsAsFactors=FALSE, colClasses=to.get) } collected <- do.call(rbind, collected) blacklist <- GRanges(collected[,1], IRanges(collected[,2], collected[,3])) blacklist ## ----------------------------------------------------------------------------- library(csaw) param <- readParam(minq=10, discard=blacklist) param ## ----------------------------------------------------------------------------- win.data <- windowCounts(h3k27me3data$Path, param=param, width=2000, spacing=500, ext=200) win.data ## ----------------------------------------------------------------------------- bins <- windowCounts(h3k27me3data$Path, bin=TRUE, width=10000, param=param) win.data <- normFactors(bins, se.out=win.data) (normfacs <- win.data$norm.factors) ## ----compoplot, fig.width=12, fig.asp=0.5, fig.cap="Mean-difference plots for the bin counts, comparing sample 1 to all other samples. The red line represents the log-ratio of the normalization factors between samples."---- bin.ab <- scaledAverage(bins) adjc <- calculateCPM(bins, use.norm.factors=FALSE) par(cex.lab=1.5, mfrow=c(1,3)) smoothScatter(bin.ab, adjc[,1]-adjc[,2], ylim=c(-6, 6), xlab="Average abundance", ylab="Log-ratio (1 vs 2)") abline(h=log2(normfacs[1]/normfacs[4]), col="red") smoothScatter(bin.ab, adjc[,1]-adjc[,3], ylim=c(-6, 6), xlab="Average abundance", ylab="Log-ratio (1 vs 3)") abline(h=log2(normfacs[2]/normfacs[4]), col="red") smoothScatter(bin.ab, adjc[,1]-adjc[,4], ylim=c(-6, 6), xlab="Average abundance", ylab="Log-ratio (1 vs 4)") abline(h=log2(normfacs[3]/normfacs[4]), col="red") ## ----------------------------------------------------------------------------- filter.stat <- filterWindowsGlobal(win.data, bins) min.fc <- 2 ## ----bghistplot, fig.cap="Histogram of average abundances across all 10 kbp genomic bins. The filter threshold is shown as the red line."---- hist(filter.stat$back.abundances, main="", breaks=50, xlab="Background abundance (log2-CPM)") threshold <- filter.stat$abundances[1] - filter.stat$filter[1] + log2(min.fc) abline(v=threshold, col="red") ## ----------------------------------------------------------------------------- keep <- filter.stat$filter > log2(min.fc) summary(keep) filtered.data <- win.data[keep,] ## ----------------------------------------------------------------------------- library(edgeR) y <- asDGEList(filtered.data) str(y) ## ----------------------------------------------------------------------------- genotype <- h3k27me3data$Description genotype[grep("control", genotype)] <- "wt" genotype[grep("knock-out", genotype)] <- "ko" genotype <- factor(genotype) design <- model.matrix(~0+genotype) colnames(design) <- levels(genotype) design ## ----bcvplot, fig.cap="Abundance-dependent trend in the BCV for each window, represented by the blue line. Common (red) and tagwise estimates (black) are also shown."---- y <- estimateDisp(y, design) summary(y$trended.dispersion) plotBCV(y) ## ----qlplot, fig.cap="Effect of EB shrinkage on the raw QL dispersion estimate for each window (black) towards the abundance-dependent trend (blue) to obtain squeezed estimates (red)."---- fit <- glmQLFit(y, design, robust=TRUE) summary(fit$df.prior) plotQLDisp(fit) ## ----mdsplot, fig.cap="MDS plot with two dimensions for all samples in the H3K27me3 data set. Samples are labelled and coloured according to the genotype. A larger top set of windows was used to improve the visualization of the genome-wide differences between the WT samples."---- plotMDS(cpm(y, log=TRUE), top=10000, labels=genotype, col=c("red", "blue")[as.integer(genotype)]) ## ----------------------------------------------------------------------------- contrast <- makeContrasts(wt-ko, levels=design) res <- glmQLFTest(fit, contrast=contrast) ## ----------------------------------------------------------------------------- # Counting into 500 bp windows. win.data2 <- windowCounts(h3k27me3data$Path, param=param, width=500, spacing=100, ext=200) # Re-using the same normalization factors. win.data2$norm.factors <- win.data$norm.factors # Filtering on abundance. filter.stat2 <- filterWindowsGlobal(win.data2, bins) keep2 <- filter.stat2$filter > log2(min.fc) filtered.data2 <- win.data2[keep2,] # Performing the statistical analysis. y2 <- asDGEList(filtered.data2) y2 <- estimateDisp(y2, design) fit2 <- glmQLFit(y2, design, robust=TRUE) res2 <- glmQLFTest(fit2, contrast=contrast) ## ----------------------------------------------------------------------------- merged <- mergeResultsList(list(filtered.data, filtered.data2), tab.list=list(res$table, res2$table), equiweight=TRUE, tol=100, merge.args=list(max.width=30000)) merged$regions ## ----------------------------------------------------------------------------- tabcom <- merged$combined is.sig <- tabcom$FDR <= 0.05 summary(is.sig) ## ----------------------------------------------------------------------------- table(tabcom$direction[is.sig]) ## ----------------------------------------------------------------------------- tabbest <- merged$best is.sig.pos <- (tabbest$rep.logFC > 0)[is.sig] summary(is.sig.pos) ## ----------------------------------------------------------------------------- out.ranges <- merged$regions mcols(out.ranges) <- data.frame(tabcom, best.logFC=tabbest$rep.logFC) saveRDS(file="h3k27me3_results.rds", out.ranges) ## ----------------------------------------------------------------------------- library(TxDb.Mmusculus.UCSC.mm10.knownGene) library(org.Mm.eg.db) anno <- detailRanges(out.ranges, orgdb=org.Mm.eg.db, txdb=TxDb.Mmusculus.UCSC.mm10.knownGene) mcols(out.ranges) <- cbind(mcols(out.ranges), anno) ## ----------------------------------------------------------------------------- cdx2 <- genes(TxDb.Mmusculus.UCSC.mm10.knownGene)["12591"] # Cdx2 Entrez ID cur.region <- subsetByOverlaps(out.ranges, cdx2)[1] cur.region ## ---- results="hide", echo=FALSE---------------------------------------------- if (cur.region$FDR > 0.05 || cur.region$best.logFC < 0) { stop("Cdx2 should be significantly upregulated in WT") } ## ----------------------------------------------------------------------------- library(Gviz) gax <- GenomeAxisTrack(col="black", fontsize=15, size=2) greg <- GeneRegionTrack(TxDb.Mmusculus.UCSC.mm10.knownGene, showId=TRUE, geneSymbol=TRUE, name="", background.title="transparent") symbols <- unlist(mapIds(org.Mm.eg.db, gene(greg), "SYMBOL", "ENTREZID", multiVals = "first")) symbol(greg) <- symbols[gene(greg)] ## ----tfplot, fig.width=8, fig.asp=0.75, fig.cap="Coverage tracks for a region with H3K27me3 enrichment in KO (top two tracks) against the WT (last two tracks)."---- collected <- list() lib.sizes <- filtered.data$totals/1e6 for (i in seq_along(h3k27me3data$Path)) { reads <- extractReads(bam.file=h3k27me3data$Path[i], cur.region, param=param) cov <- as(coverage(reads)/lib.sizes[i], "GRanges") collected[[i]] <- DataTrack(cov, type="histogram", lwd=0, ylim=c(0,1), name=h3k27me3data$Description[i], col.axis="black", col.title="black", fill="darkgray", col.histogram=NA) } plotTracks(c(gax, collected, greg), chromosome=as.character(seqnames(cur.region)), from=start(cur.region), to=end(cur.region)) ## ----------------------------------------------------------------------------- col1a2 <- genes(TxDb.Mmusculus.UCSC.mm10.knownGene)["12843"] # Col1a2 Entrez ID cur.region <- subsetByOverlaps(out.ranges, col1a2) cur.region ## ---- results="hide", echo=FALSE---------------------------------------------- if (cur.region$FDR < 0.05) { stop("Col1a2 should not be significantly DB") } ## ----------------------------------------------------------------------------- sessionInfo()