## ----style-knitr, eval=TRUE, echo=FALSE, results="asis"------------------ BiocStyle::latex() ## ----options, results='hide', message=FALSE, eval=TRUE, echo=FALSE------- library(chromstaR) options(width=110) ## ----univariate_narrow_library, results='hide', message=FALSE, eval=TRUE---- library(chromstaR) ## ----univariate_narrow_binning, results='markup', message=FALSE, eval=TRUE---- ## === Step 1: Binning === # Get an example BAM file file <- system.file("extdata","euratrans","lv-H3K4me3-BN-male-bio2-tech1.bam", package="chromstaRData") # Get the chromosome lengths (see ?GenomeInfoDb::fetchExtendedChromInfoFromUCSC) # This is only necessary for BED files. BAM files are handled automatically. data(rn4_chrominfo) head(rn4_chrominfo) # We use bin size 1000bp and chromosome 12 to keep the example quick binned.data <- binReads(file, assembly=rn4_chrominfo, binsizes=1000, chromosomes='chr12') print(binned.data) ## ----univariate_narrow_peak_calling, results='markup', eval=TRUE, message=FALSE---- ## === Step 2: Peak calling === # We keep the posterior probabilities to be able to change the cutoff later in step 4) model <- callPeaksUnivariate(binned.data, verbosity=0, keep.posteriors=TRUE) ## ----univariate_narrow_plotting, fig.width=6, fig.height=4, out.width='0.5\\textwidth'---- ## === Step 3: Checking the fit === # For a narrow modification, the fit should look something like this, # with the 'modified'-component near the bottom of the figure plotHistogram(model) + ggtitle('H3K4me3') ## ----univariate_narrow_posterior, results='markup', message=FALSE, eval=TRUE---- ## === Step 4: Working with peaks === # Get the number and average size of peaks segments <- model$segments peaks <- segments[segments$state == 'modified'] length(peaks); mean(width(peaks)) # Change the posterior cutoff and get number of peaks model <- changePostCutoff(model, post.cutoff=0.999) segments <- model$segments peaks <- segments[segments$state == 'modified'] length(peaks); mean(width(peaks)) ## ----univariate_narrow_export, results='hide', message=FALSE, eval=TRUE---- ## === Step 5: Export to genome browser === # We can export peak calls and binned read counts with exportUnivariates(list(model), filename=tempfile(), what=c('peaks','counts')) ## ----univariate_broad_library, results='hide', message=FALSE, eval=TRUE---- library(chromstaR) ## ----univariate_broad_binning, results='hide', message=FALSE, eval=TRUE---- ## === Step 1: Binning === # Get an example BAM file file <- system.file("extdata","euratrans","lv-H3K27me3-BN-male-bio2-tech1.bam", package="chromstaRData") # Get the chromosome lengths (see ?GenomeInfoDb::fetchExtendedChromInfoFromUCSC) # This is only necessary for BED files. BAM files are handled automatically. data(rn4_chrominfo) head(rn4_chrominfo) # We use bin size 1000bp and chromosome 12 to keep the example quick binned.data <- binReads(file, assembly=rn4_chrominfo, binsizes=1000, chromosomes='chr12') ## ----univariate_broad_peak_calling, results='markup', eval=TRUE, message=FALSE---- ## === Step 2: Peak calling === # We keep the posterior probabilities to be able to change the cutoff later in step 4) model <- callPeaksUnivariate(binned.data, verbosity=0, keep.posteriors=TRUE) ## ----univariate_broad_plotting, fig.width=6, fig.height=4, out.width='0.5\\textwidth'---- ## === Step 3: Checking the fit === # For a broad modification, the fit should look something like this, # with a 'modified'-component that fits the thick tail of the distribution. plotHistogram(model) + ggtitle('H3K27me3') ## ----univariate_broad_posterior, results='markup', message=FALSE, eval=TRUE---- ## === Step 4: Working with peaks === # Get the number and average size of peaks segments <- model$segments peaks <- segments[segments$state == 'modified'] length(peaks); mean(width(peaks)) # Change the posterior cutoff and get number of peaks # !! Note that for broad peaks, setting a posterior cutoff # will split the peaks into smaller, better defined peaks !! model <- changePostCutoff(model, post.cutoff=0.999) segments <- model$segments peaks <- segments[segments$state == 'modified'] length(peaks); mean(width(peaks)) ## ----univariate_broad_export, results='hide', message=FALSE, eval=TRUE---- ## === Step 5: Export to genome browser === # We can export peak calls and binned read counts with exportUnivariates(list(model), filename=tempfile(), what=c('peaks','counts')) ## ----univariate_broad_H4K20me1, echo=TRUE, results='hide', message=FALSE, fig.width=6, fig.height=4, out.width='0.5\\textwidth'---- ## === Step 1-3: Another example for mark H4K20me1 === file <- system.file("extdata","euratrans","lv-H4K20me1-BN-male-bio1-tech1.bam", package="chromstaRData") data(rn4_chrominfo) binned.data <- binReads(file, assembly=rn4_chrominfo, binsizes=1000, chromosomes='chr12') model <- callPeaksUnivariate(binned.data, max.time=60, verbosity=0) plotHistogram(model) + ggtitle('H4K20me1') ## ----univariate_replicate_library, results='hide', message=FALSE, eval=TRUE---- library(chromstaR) ## ----multivariate_replicate_binning, results='markup', message=FALSE, eval=TRUE---- ## === Step 1: Binning === # Let's get some example data with 3 replicates in spontaneous hypertensive rat (SHR) file.path <- system.file("extdata","euratrans", package='chromstaRData') files.good <- list.files(file.path, pattern="H3K27me3.*SHR.*bam$", full.names=TRUE)[1:3] # We fake a replicate with poor quality by taking a different mark entirely files.poor <- list.files(file.path, pattern="H4K20me1.*SHR.*bam$", full.names=TRUE)[1] files <- c(files.good, files.poor) # Obtain chromosome lengths. This is only necessary for BED files. BAM files are # handled automatically. data(rn4_chrominfo) head(rn4_chrominfo) # Define experiment structure exp <- data.frame(file=files, mark='H3K27me3', condition='SHR', replicate=1:4, pairedEndReads=FALSE, controlFiles=NA) # We use bin size 1000bp and chromosome 12 to keep the example quick binned.data <- list() for (file in files) { binned.data[[basename(file)]] <- binReads(file, binsize=1000, assembly=rn4_chrominfo, chromosomes='chr12', experiment.table=exp) } ## ----multivariate_replicate_univariate, results='hide', message=FALSE, eval=TRUE---- ## === Step 2: Univariate peak calling === # The univariate fit is obtained for each replicate models <- list() for (i1 in 1:length(binned.data)) { models[[i1]] <- callPeaksUnivariate(binned.data[[i1]], max.time=60) } ## ----multivariate_replicate_peak_calling, results='markup', message=FALSE, eval=TRUE---- ## === Step 3: Check replicate correlation === # We run a multivariate peak calling on all 4 replicates # A warning is issued because replicate 4 is very different from the others multi.model <- callPeaksReplicates(models, max.time=60, eps=1) # Checking the correlation confirms that Rep4 is very different from the others print(multi.model$replicateInfo$correlation) ## ----multivariate_replicate_peak_calling2, results='hide', message=FALSE, eval=TRUE---- ## === Step 4: Peak calling with replicates === # We redo the previous step without the "bad" replicate # Also, we force all replicates to agree in their peak calls multi.model <- callPeaksReplicates(models[1:3], force.equal=TRUE, max.time=60) ## ----multivariate_replicate_export, results='hide', message=FALSE, eval=TRUE---- ## === Step 5: Export to genome browser === # Finally, we can export the results as BED file exportMultivariate(multi.model, filename=tempfile(), what=c('peaks','counts')) ## ----multivariate_differential_library, results='hide', message=FALSE, eval=TRUE---- library(chromstaR) ## ----multivariate_differential_preparation, results='markup', message=FALSE, eval=TRUE---- #=== Step 1: Preparation === ## Prepare the file paths. Exchange this with your input and output directories. inputfolder <- system.file("extdata","euratrans", package="chromstaRData") outputfolder <- file.path(tempdir(), 'H4K20me1-example') ## Define experiment structure, put NA if you don't have controls data(experiment_table_H4K20me1) print(experiment_table_H4K20me1) ## Define assembly # This is only necessary if you have BED files, BAM files are handled automatically. # For common assemblies you can also specify them as 'hg19' for example. data(rn4_chrominfo) head(rn4_chrominfo) ## ----multivariate_differential_Chromstar, results='hide', message=FALSE, eval=TRUE---- #=== Step 2: Run Chromstar === ## Run ChromstaR Chromstar(inputfolder, experiment.table=experiment_table_H4K20me1, outputfolder=outputfolder, numCPU=4, binsize=1000, assembly=rn4_chrominfo, prefit.on.chr='chr12', chromosomes='chr12', mode='differential') ## ----multivariate_differential_listfiles, results='markup', message=FALSE, eval=TRUE---- ## Results are stored in 'outputfolder' and can be loaded for further processing list.files(outputfolder) model <- get(load(file.path(outputfolder,'multivariate', 'multivariate_mode-differential_mark-H4K20me1.RData'))) ## ----multivariate_differential_stateBrewer, results='markup', message=TRUE, eval=TRUE---- ## === Step 3: Construct differential and common states === diff.states <- stateBrewer(experiment_table_H4K20me1, mode='differential', differential.states=TRUE) print(diff.states) common.states <- stateBrewer(experiment_table_H4K20me1, mode='differential', common.states=TRUE) print(common.states) ## ----multivariate_differential_export, results='hide', message=FALSE, eval=TRUE---- ## === Step 5: Export to genome browser === # Export only differential states exportMultivariate(model, filename=tempfile(), what=c('peaks','counts','combinations'), include.states=diff.states) ## ----multivariate_combinatorial_library, results='hide', message=FALSE, eval=TRUE---- library(chromstaR) ## ----multivariate_combinatorial_preparation, results='markup', message=FALSE, eval=TRUE---- #=== Step 1: Preparation === ## Prepare the file paths. Exchange this with your input and output directories. inputfolder <- system.file("extdata","euratrans", package="chromstaRData") outputfolder <- file.path(tempdir(), 'SHR-example') ## Define experiment structure, put NA if you don't have controls # (SHR = spontaneous hypertensive rat) data(experiment_table_SHR) print(experiment_table_SHR) ## Define assembly # This is only necessary if you have BED files, BAM files are handled automatically. # For common assemblies you can also specify them as 'hg19' for example. data(rn4_chrominfo) head(rn4_chrominfo) ## ----multivariate_combinatorial_Chromstar, results='hide', message=FALSE, eval=TRUE---- #=== Step 2: Run Chromstar === ## Run ChromstaR Chromstar(inputfolder, experiment.table=experiment_table_SHR, outputfolder=outputfolder, numCPU=4, binsize=1000, assembly=rn4_chrominfo, prefit.on.chr='chr12', chromosomes='chr12', mode='combinatorial') ## ----multivariate_combinatorial_listfiles, results='markup', message=FALSE, eval=TRUE, fig.width=4, fig.height=3, out.width='0.5\\textwidth'---- ## Results are stored in 'outputfolder' and can be loaded for further processing list.files(outputfolder) model <- get(load(file.path(outputfolder,'multivariate', 'multivariate_mode-combinatorial_condition-SHR.RData'))) # Obtain genomic frequencies for combinatorial states genomicFrequencies(model) # Plot transition probabilities and read count correlation heatmapTransitionProbs(model) + ggtitle('Transition probabilities') heatmapCountCorrelation(model) + ggtitle('Read count correlation') ## ----multivariate_combinatorial_enrichment, results='markup', message=FALSE, eval=TRUE---- ## === Step 3: Enrichment analysis === # Annotations can easily be obtained with the biomaRt package. Of course, you can # also load them from file if you already have annotation files available. library(biomaRt) ensembl <- useMart('ENSEMBL_MART_ENSEMBL', host='may2012.archive.ensembl.org', dataset='rnorvegicus_gene_ensembl') genes <- getBM(attributes=c('ensembl_gene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'external_gene_id', 'gene_biotype'), mart=ensembl) # Transform to GRanges for easier handling genes <- GRanges(seqnames=paste0('chr',genes$chromosome_name), ranges=IRanges(start=genes$start, end=genes$end), strand=genes$strand, name=genes$external_gene_id, biotype=genes$gene_biotype) print(genes) ## ----multivariate_combinatorial_enrichment_plot1, results='markup', message=FALSE, eval=TRUE, fig.width=8, fig.height=4, out.width='0.8\\textwidth', fig.align='center'---- # We expect promoter [H3K4me3] and bivalent-promoter signatures [H3K4me3+H3K27me3] # to be enriched at transcription start sites. plotFoldEnrichment(hmm = model, annotation = genes, bp.around.annotation = 15000) + ggtitle('Fold enrichment around genes') + xlab('distance from gene body') # Plot enrichment only at TSS. We make use of the fact that TSS is the start of a gene. plotFoldEnrichment(model, genes, region = 'start') + ggtitle('Fold enrichment around TSS') + xlab('distance from TSS in [bp]') # Note: If you want to facet the plot because you have many combinatorial states you # can do that with plotFoldEnrichment(model, genes, region = 'start') + facet_wrap(~ combination) + ggtitle('Fold enrichment around TSS') ## ----multivariate_combinatorial_enrichment_plot2, results='markup', message=FALSE, eval=TRUE, fig.width=8, fig.height=4, out.width='0.8\\textwidth', fig.align='center'---- # Another form of visualization that shows every TSS in a heatmap tss <- resize(genes, width = 3, fix = 'start') plotEnrichCountHeatmap(model, tss) + theme(strip.text.x = element_text(size=6)) + ggtitle('Read count around TSS') ## ----multivariate_combinatorial_enrichment_plot3, results='markup', message=FALSE, eval=TRUE, fig.width=8, fig.height=4, out.width='0.8\\textwidth', fig.align='center'---- # Fold enrichment with different biotypes, showing that protein coding genes are # enriched with (bivalent) promoter combinations [H3K4me3] and [H3K4me3+H3K27me3], # while rRNA is enriched with the empty [] combination. biotypes <- split(tss, tss$biotype) plotFoldEnrichHeatmap(model, annotations=biotypes) + coord_flip() + ggtitle('Fold enrichment with different biotypes') ## ----multivariate_combinatorial_expression, results='markup', message=FALSE, eval=TRUE, fig.width=8, fig.height=4, out.width='0.5\\textwidth', fig.align='center'---- # === Step 4: Expression analysis === # Suppose you have expression data as well for your experiment. The following code # shows you how to get the expression values for each combinatorial state. data(expression_lv) head(expression_lv) # We need to get coordinates for each of the genes library(biomaRt) ensembl <- useMart('ENSEMBL_MART_ENSEMBL', host='may2012.archive.ensembl.org', dataset='rnorvegicus_gene_ensembl') genes <- getBM(attributes=c('ensembl_gene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'external_gene_id', 'gene_biotype'), mart=ensembl) expr <- merge(genes, expression_lv, by='ensembl_gene_id') # Transform to GRanges expression.SHR <- GRanges(seqnames=paste0('chr',expr$chromosome_name), ranges=IRanges(start=expr$start, end=expr$end), strand=expr$strand, name=expr$external_gene_id, biotype=expr$gene_biotype, expression=expr$expression_SHR) # We apply an asinh transformation to reduce the effect of outliers expression.SHR$expression <- asinh(expression.SHR$expression) ## Plot plotExpression(model, expression.SHR) + theme(axis.text.x=element_text(angle=0, hjust=0.5)) + ggtitle('Expression of genes overlapping combinatorial states') plotExpression(model, expression.SHR, return.marks=TRUE) + ggtitle('Expression of marks overlapping combinatorial states') ## ----combined_library, results='hide', message=FALSE, eval=TRUE---------- library(chromstaR) ## ----combined_preparation, results='markup', message=FALSE, eval=TRUE---- #=== Step 1: Preparation === ## Prepare the file paths. Exchange this with your input and output directories. inputfolder <- system.file("extdata","euratrans", package="chromstaRData") outputfolder <- file.path(tempdir(), 'SHR-BN-example') ## Define experiment structure, put NA if you don't have controls data(experiment_table) print(experiment_table) ## Define assembly # This is only necessary if you have BED files, BAM files are handled automatically. # For common assemblies you can also specify them as 'hg19' for example. data(rn4_chrominfo) head(rn4_chrominfo) ## ----combined_Chromstar, results='hide', message=FALSE, eval=TRUE-------- #=== Step 2: Run Chromstar === ## Run ChromstaR Chromstar(inputfolder, experiment.table=experiment_table, outputfolder=outputfolder, numCPU=4, binsize=1000, assembly=rn4_chrominfo, prefit.on.chr='chr12', chromosomes='chr12', mode='differential') ## ----combined_listfiles, results='markup', message=FALSE, eval=TRUE------ ## Results are stored in 'outputfolder' and can be loaded for further processing list.files(outputfolder) model <- get(load(file.path(outputfolder,'combined', 'combined_mode-differential.RData'))) ## ----combined_analysis, results='markup', message=FALSE, eval=TRUE------- #=== Step 3: Analysis and export === ## Obtain all genomic regions where the two tissues have different states segments <- model$segments diff.segments <- segments[segments$combination.SHR != segments$combination.BN] # Let's be strict with the differences and get only those where both marks are different diff.segments <- diff.segments[diff.segments$differential.score >= 1.9] exportGRangesAsBedFile(diff.segments, trackname='differential_chromatin_states', filename=tempfile(), scorecol='differential.score') ## Obtain all genomic regions where we find a bivalent promoter in BN but not in SHR bivalent.BN <- segments[segments$combination.BN == '[H3K27me3+H3K4me3]' & segments$combination.SHR != '[H3K27me3+H3K4me3]'] ## Obtain all genomic regions where BN and SHR have promoter signatures promoter.BN <- segments[segments$transition.group == 'constant [H3K4me3]'] ## Get transition frequencies print(model$frequencies) ## ----combined_enrichment, results='markup', message=FALSE, eval=TRUE----- ## === Step 4: Enrichment analysis === # Annotations can easily be obtained with the biomaRt package. Of course, you can # also load them from file if you already have annotation files available. library(biomaRt) ensembl <- useMart('ENSEMBL_MART_ENSEMBL', host='may2012.archive.ensembl.org', dataset='rnorvegicus_gene_ensembl') genes <- getBM(attributes=c('ensembl_gene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'external_gene_id', 'gene_biotype'), mart=ensembl) # Transform to GRanges for easier handling genes <- GRanges(seqnames=paste0('chr',genes$chromosome_name), ranges=IRanges(start=genes$start, end=genes$end), strand=genes$strand, name=genes$external_gene_id, biotype=genes$gene_biotype) print(genes) ## ----combined_enrichment_plot1, results='markup', message=FALSE, eval=TRUE, fig.width=8, fig.height=4, out.width='0.8\\textwidth', fig.align='center'---- # We expect promoter [H3K4me3] and bivalent-promoter signatures [H3K4me3+H3K27me3] # to be enriched at transcription start sites. plots <- plotFoldEnrichment(hmm=model, annotation=genes, region='start') plots[['BN']] + facet_wrap(~ combination) + ggtitle('Fold enrichment around TSS') + xlab('distance from TSS') plots <- plotFoldEnrichment(hmm=model, annotation=genes, region='start', what='peaks') plots[['BN']] + facet_wrap(~ mark) + ggtitle('Fold enrichment around TSS') + xlab('distance from TSS') ## ----combined_enrichment_plot3, results='markup', message=FALSE, eval=TRUE, fig.width=8, fig.height=4, out.width='0.8\\textwidth', fig.align='center'---- # Fold enrichment with different biotypes, showing that protein coding genes are # enriched with (bivalent) promoter combinations [H3K4me3] and [H3K4me3+H3K27me3], # while rRNA is enriched with the empty [] combination. tss <- resize(genes, width = 3, fix = 'start') biotypes <- split(tss, tss$biotype) plots <- plotFoldEnrichHeatmap(model, annotations=biotypes) plots[['BN']] + coord_flip() + scale_fill_gradient(low='white', high='blue', limits=c(0,100)) + ggtitle('Fold enrichment with different biotypes') ## ----faq_postcutoff, results='markup', message=FALSE, eval=TRUE---------- model <- get(load(file.path(outputfolder,'combined', 'combined_mode-differential.RData'))) # Set a strict cutoff close to 1 model2 <- changePostCutoff(model, post.cutoff=0.999) ## Compare the number and width of peaks before and after cutoff adjustment length(model$segments); mean(width(model$segments)) length(model2$segments); mean(width(model2$segments)) ## ----faq_postcutoff_single, results='markup', message=FALSE, eval=TRUE---- # Set a stricter cutoff for H3K4me3 than for H3K27me3 cutoffs <- c(0.8, 0.8, 0.8, 0.8, 0.999, 0.999, 0.999, 0.999) names(cutoffs) <- model$info$ID print(cutoffs) model2 <- changePostCutoff(model, post.cutoff=cutoffs) ## Compare the number and width of peaks before and after cutoff adjustment length(model$segments); mean(width(model$segments)) length(model2$segments); mean(width(model2$segments)) ## ----faq_heatmapCombinations, results='markup', message=FALSE, eval=TRUE, fig.width=8, fig.height=4, fig.align='center', out.width='0.8\\textwidth'---- heatmapCombinations(marks=c('H3K4me3', 'H3K27me3', 'H3K36me3', 'H3K27Ac')) ## ----sessionInfo, eval=TRUE, results="asis"------------------------------ toLatex(sessionInfo())