## ----style, echo = FALSE, results = 'asis'------------------------------------ BiocStyle::markdown() ## ----global_options, include = FALSE------------------------------------------ knitr::opts_chunk$set(prompt = TRUE, collapse = TRUE, fig.align = "center") ## ---- fig.cap="transcriptR workflow", fig.align='center', echo=FALSE---------- knitr::include_graphics(path = "./images/Main_scheme.png") ## ----eval = FALSE------------------------------------------------------------- # if (!requireNamespace("BiocManager", quietly=TRUE)) # install.packages("BiocManager") # BiocManager::install("transcriptR") ## ----eval = TRUE, echo = FALSE, message = FALSE------------------------------- library(transcriptR) ## ----cache = FALSE, echo = TRUE, results = 'hide', message = FALSE------------ library(GenomicFeatures) library(TxDb.Hsapiens.UCSC.hg19.knownGene) # Use UCSC genes as the reference annotations knownGene <- TxDb.Hsapiens.UCSC.hg19.knownGene # Extract genes information knownGene.genes <- genes(knownGene) ## ---- cache = FALSE----------------------------------------------------------- # load TranscriptionDataSet object data(tds) # view TranscriptionDataSet object tds ## ---- eval = FALSE------------------------------------------------------------ # # or initialize a new one (do not run the following code) # # specify a region of interest (optional) # region <- GRanges(seqnames = "chr15", ranges = IRanges(start = 63260000, end = 63300000)) # # object construction # tds <- constructTDS(file = path.to.Bam, # region = region, # fragment.size = 250, # unique = FALSE, # paired.end = FALSE, # swap.strand = TRUE) ## ----------------------------------------------------------------------------- estimateBackground(tds, fdr.cutoff = 0.01) # view estimated cutoff value tds ## ----eval = FALSE------------------------------------------------------------- # # look at the coverage profile of the regions expressed above the background level # exportCoverage(object = tds, file = "plus.bw", type = "bigWig", strand = "+", # filter.by.coverage.cutoff = TRUE, rpm = FALSE) # # # or check the raw coverage (all expressed regions) # exportCoverage(object = tds, file = "plus_raw.bw", type = "bigWig", strand = "+", # filter.by.coverage.cutoff = FALSE, rpm = FALSE) ## ----------------------------------------------------------------------------- # create a range of gap distances to test # from 0 bp to 10000 bp with the step of 100 bp gd <- seq(from = 0, to = 10000, by = 100) estimateGapDistance(object = tds, annot = knownGene.genes, filter.annot = TRUE, fpkm.quantile = 0.25, gap.dist.range = gd) # view the optimal gap distance minimazing the sum of two errors tds ## ----fig.cap="Gap distance error rate", fig.height=7, fig.width=7, fig.align='center'---- # get intermediate calculation gdTest <- getTestedGapDistances(tds) head(gdTest) # plot error rates plotErrorRate(tds, lwd = 2) ## ----------------------------------------------------------------------------- detectTranscripts(tds, estimate.params = TRUE) ## ----------------------------------------------------------------------------- annotateTranscripts(object = tds, annot = knownGene.genes, min.overlap = 0.5) ## ----------------------------------------------------------------------------- trx <- getTranscripts(tds, min.length = 250, min.fpkm = 0.5) head(trx, 5) ## ----eval = FALSE------------------------------------------------------------- # transcriptsToBed(object = trx, file = "transcripts.bed", strand.color = c("blue", "red")) ## ---- fig.cap="Classification of the peaks on gene associated and background", fig.align='center', echo=FALSE---- knitr::include_graphics(path = "./images/Gene_associated_peak_prediction.png") ## ---- fig.cap="Average RNA-seq signal across ChIP peak region", fig.align ='center', echo=FALSE---- knitr::include_graphics(path = "./images/Average_RNAseq_signal_across_peak_region.png") ## ---- fig.cap='ChIP peak "strandedness" prediction', fig.align='center', echo=FALSE---- knitr::include_graphics(path = "./images/Peak_strandedness_prediction.png") ## ---- cache = FALSE----------------------------------------------------------- # load ChipDataSet object data(cds) cds ## ---- eval = FALSE------------------------------------------------------------ # # or initialize a new one (do not run the following code) # # specify the region of interest (optional) # region <- GRanges(seqnames = "chr15", ranges = IRanges(start = 63260000, end = 63300000)) # # object construction # cds <- constructCDS(peaks = path.to.peaks, # path to a peak file # reads = path.to.reads, # path to a Bam file with reads # region = region, # TxDb = knownGene, # annotation database to evaluate # # genomic distribution of the peaks # tssOf = "transcript", # genomic feature to extract TSS region # tss.region = c(-2000, 2000), # size of the TSS region, # # from -2kb to +2 kb # reduce.peaks = TRUE, # merge neighboring peaks # gapwidth = 500, # min. gap distance between peaks # unique = TRUE, # swap.strand = FALSE) ## ----eval = TRUE-------------------------------------------------------------- peaks <- getPeaks(cds) head(peaks, 3) ## ----cache = FALSE------------------------------------------------------------ getGenomicAnnot(cds) ## ---- fig.cap="Genomic distribution of the peaks", fig.height=7, fig.width=7, fig.align='center'---- plotGenomicAnnot(object = cds, plot.type = "distr") ## ---- fig.cap="Enrichment of peaks at distinct genomic features", fig.height=7, fig.width=7, fig.align='center'---- plotGenomicAnnot(object = cds, plot.type = "enrich") ## ----eval=TRUE, fig.cap="Estimated features", fig.height=5, fig.width=10, fig.align='center'---- plotFeatures(cds, plot.type = "box") ## ----eval = TRUE-------------------------------------------------------------- predictTssOverlap(object = cds, feature = "pileup", p = 0.75) cds ## ----------------------------------------------------------------------------- peaks <- getPeaks(cds) head(peaks, 3) ## ----eval = TRUE-------------------------------------------------------------- getConfusionMatrix(cds) getPredictorSignificance(cds) ## ---- fig.cap="Performance of the classification model (gene associated peaks prediction)", eval=TRUE, fig.width=6, fig.height=6, fig.align='center'---- plotROC(object = cds, col = "red3", grid = TRUE, auc.polygon = TRUE) ## ----eval = TRUE-------------------------------------------------------------- predictStrand(cdsObj = cds, tdsObj = tds, quant.cutoff = 0.1, win.size = 2000) cds peaks <- getPeaks(cds) head(peaks[ peaks$predicted.tssOverlap == "yes" ], 3) ## ----eval = TRUE-------------------------------------------------------------- df <- getQuadProb(cds, strand = "+") head(df, 3) ## ----eval = TRUE-------------------------------------------------------------- getProbTreshold(cds) ## ----eval = FALSE------------------------------------------------------------- # peaksToBed(object = cds, file = "peaks.bed", gene.associated.peaks = TRUE) ## ----------------------------------------------------------------------------- # set `estimate.params = TRUE` to re-calculate FPKM and coverage density breakTranscriptsByPeaks(tdsObj = tds, cdsObj = cds, estimate.params = TRUE) # re-annotate identified transcripts annotateTranscripts(object = tds, annot = knownGene.genes, min.overlap = 0.5) # retrieve the final set of transcripts trx.final <- getTranscripts(tds) ## ----eval = FALSE------------------------------------------------------------- # # visualize the final set of transcripts in a UCSC genome browser # transcriptsToBed(object = trx.final, file = "transcripts_final.bed") ## ----------------------------------------------------------------------------- hits <- findOverlaps(query = trx, subject = trx.final) trx.broken <- trx[unique(queryHits(hits)[duplicated(queryHits(hits))])] head(trx.broken, 5) ## ----echo=FALSE--------------------------------------------------------------- sessionInfo()