The purpose of this quick start is to introduce the four newly implemented functions, toRanges
, annoGO
, annotatePeakInBatch
, and addGeneIDs
in the new version of the ChIPpeakAnno. With those wrapper functions, the annotation of ChIP-Seq peaks becomes streamlined into four major steps:
1 Read peak data with toGRanges
2 Generate annotation data with toGRanges
3 Annotate peaks with annotatePeakInBatch
4 Add additional information with addGeneIDs
Most of time user can use the default settings of the arguments of those functions. This makes the annotation pipeline straightforward and easy to use.
Note that the version of the annotation data must match with the genome used for mapping because the coordinates may differ for different genome releases. For example, if you are using Mus_musculus.v103 for mapping, you’d best also use EnsDb.Mmusculus.v103 for annotation. For more information about how to prepare the annotation data, please refer ?getAnnotation.
GRanges
with toGRanges
## First, load the ChIPpeakAnno package
library(ChIPpeakAnno)
path <- system.file("extdata", "Tead4.broadPeak", package="ChIPpeakAnno")
peaks <- toGRanges(path, format="broadPeak")
peaks[1:2]
## GRanges object with 2 ranges and 4 metadata columns:
## seqnames ranges strand | score signalValue pValue
## <Rle> <IRanges> <Rle> | <integer> <numeric> <numeric>
## peak12338 chr2 175473-176697 * | 206 668.42 -1
## peak12339 chr2 246412-246950 * | 31 100.23 -1
## qValue
## <numeric>
## peak12338 -1
## peak12339 -1
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
toGRanges
library(EnsDb.Hsapiens.v75)
annoData <- toGRanges(EnsDb.Hsapiens.v75)
annoData[1:2]
## GRanges object with 2 ranges and 1 metadata column:
## seqnames ranges strand | gene_name
## <Rle> <IRanges> <Rle> | <character>
## ENSG00000223972 chr1 11869-14412 + | DDX11L1
## ENSG00000227232 chr1 14363-29806 - | WASH7P
## -------
## seqinfo: 273 sequences (1 circular) from 2 genomes (hg19, GRCh37)
annotatePeakInBatch
## keep the seqnames in the same style if needed
seqlevelsStyle(peaks) <- seqlevelsStyle(annoData)
## do annotation by nearest TSS
anno <- annotatePeakInBatch(peaks, AnnotationData=annoData)
anno[1:2]
## GRanges object with 2 ranges and 13 metadata columns:
## seqnames ranges strand | score
## <Rle> <IRanges> <Rle> | <integer>
## peak12338.ENSG00000227061 chr2 175473-176697 * | 206
## peak12339.ENSG00000143727 chr2 246412-246950 * | 31
## signalValue pValue qValue peak
## <numeric> <numeric> <numeric> <character>
## peak12338.ENSG00000227061 668.42 -1 -1 peak12338
## peak12339.ENSG00000143727 100.23 -1 -1 peak12339
## feature start_position end_position
## <character> <integer> <integer>
## peak12338.ENSG00000227061 ENSG00000227061 197569 202605
## peak12339.ENSG00000143727 ENSG00000143727 264140 278283
## feature_strand insideFeature distancetoFeature
## <character> <character> <numeric>
## peak12338.ENSG00000227061 + upstream -22096
## peak12339.ENSG00000143727 + upstream -17728
## shortestDistance fromOverlappingOrNearest
## <integer> <character>
## peak12338.ENSG00000227061 20872 NearestLocation
## peak12339.ENSG00000143727 17190 NearestLocation
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
# A pie chart can be used to demonstrate the overlap features of the peaks.
pie1(table(anno$insideFeature))
## Step 4: Add additional annotation with addGeneIDs
library(org.Hs.eg.db)
anno <- addGeneIDs(anno, orgAnn="org.Hs.eg.db",
feature_id_type="ensembl_gene_id",
IDs2Add=c("symbol"))
head(anno)
## GRanges object with 6 ranges and 14 metadata columns:
## seqnames ranges strand | score
## <Rle> <IRanges> <Rle> | <integer>
## peak12338.ENSG00000227061 chr2 175473-176697 * | 206
## peak12339.ENSG00000143727 chr2 246412-246950 * | 31
## peak12340.ENSG00000143727 chr2 249352-250233 * | 195
## peak12341.ENSG00000143727 chr2 259896-261404 * | 510
## peak12342.ENSG00000143727 chr2 261931-263148 * | 48
## peak12343.ENSG00000236856 chr2 378232-378871 * | 132
## signalValue pValue qValue peak
## <numeric> <numeric> <numeric> <character>
## peak12338.ENSG00000227061 668.42 -1 -1 peak12338
## peak12339.ENSG00000143727 100.23 -1 -1 peak12339
## peak12340.ENSG00000143727 630.65 -1 -1 peak12340
## peak12341.ENSG00000143727 1649.19 -1 -1 peak12341
## peak12342.ENSG00000143727 155.56 -1 -1 peak12342
## peak12343.ENSG00000236856 426.52 -1 -1 peak12343
## feature start_position end_position
## <character> <integer> <integer>
## peak12338.ENSG00000227061 ENSG00000227061 197569 202605
## peak12339.ENSG00000143727 ENSG00000143727 264140 278283
## peak12340.ENSG00000143727 ENSG00000143727 264140 278283
## peak12341.ENSG00000143727 ENSG00000143727 264140 278283
## peak12342.ENSG00000143727 ENSG00000143727 264140 278283
## peak12343.ENSG00000236856 ENSG00000236856 388412 416885
## feature_strand insideFeature distancetoFeature
## <character> <character> <numeric>
## peak12338.ENSG00000227061 + upstream -22096
## peak12339.ENSG00000143727 + upstream -17728
## peak12340.ENSG00000143727 + upstream -14788
## peak12341.ENSG00000143727 + upstream -4244
## peak12342.ENSG00000143727 + upstream -2209
## peak12343.ENSG00000236856 + upstream -10180
## shortestDistance fromOverlappingOrNearest
## <integer> <character>
## peak12338.ENSG00000227061 20872 NearestLocation
## peak12339.ENSG00000143727 17190 NearestLocation
## peak12340.ENSG00000143727 13907 NearestLocation
## peak12341.ENSG00000143727 2736 NearestLocation
## peak12342.ENSG00000143727 992 NearestLocation
## peak12343.ENSG00000236856 9541 NearestLocation
## symbol
## <character>
## peak12338.ENSG00000227061 <NA>
## peak12339.ENSG00000143727 ACP1
## peak12340.ENSG00000143727 ACP1
## peak12341.ENSG00000143727 ACP1
## peak12342.ENSG00000143727 ACP1
## peak12343.ENSG00000236856 <NA>
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
This section demonstrates how to annotate the same peak data as in quick start 1 using a new annotation based on TxDb with toGRanges
.
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
annoData <- toGRanges(TxDb.Hsapiens.UCSC.hg19.knownGene)
annoData[1:2]
## GRanges object with 2 ranges and 0 metadata columns:
## seqnames ranges strand
## <Rle> <IRanges> <Rle>
## 1 chr19 58858172-58874214 -
## 10 chr8 18248755-18258723 +
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
seqlevelsStyle(peaks) <- seqlevelsStyle(annoData)
The same annotatePeakInBatch
function is used to annotate the peaks using annotation data just created. This time we want the peaks within 2kb upstream and up to 300bp downstream of TSS within the gene body.
anno <- annotatePeakInBatch(peaks, AnnotationData=annoData,
output="overlapping",
FeatureLocForDistance="TSS",
bindingRegion=c(-2000, 300))
anno$symbol <- xget(anno$feature, org.Hs.egSYMBOL)
head(anno)
## GRanges object with 6 ranges and 12 metadata columns:
## seqnames ranges strand | score signalValue pValue
## <Rle> <IRanges> <Rle> | <integer> <numeric> <numeric>
## peak12342 chr2 261931-263148 * | 48 155.56 -1
## peak12345 chr2 677052-677862 * | 103 334.74 -1
## peak12348 chr2 3380709-3382315 * | 110 357.22 -1
## peak12348 chr2 3380709-3382315 * | 110 357.22 -1
## peak12349 chr2 3383131-3384541 * | 199 645.56 -1
## peak12349 chr2 3383131-3384541 * | 199 645.56 -1
## qValue peak feature feature.ranges feature.strand
## <numeric> <character> <character> <IRanges> <Rle>
## peak12342 -1 peak12342 52 264869-278282 +
## peak12345 -1 peak12345 129787 667973-677439 -
## peak12348 -1 peak12348 7260 3192741-3381653 -
## peak12348 -1 peak12348 51112 3383446-3488857 +
## peak12349 -1 peak12349 7260 3192741-3381653 -
## peak12349 -1 peak12349 51112 3383446-3488857 +
## distance insideFeature distanceToSite symbol
## <integer> <character> <integer> <character>
## peak12342 1720 upstream 1720 ACP1
## peak12345 0 overlapStart 0 TMEM18
## peak12348 0 overlapStart 0 EIPR1
## peak12348 1130 upstream 1130 TRAPPC12
## peak12349 1477 upstream 1477 EIPR1
## peak12349 0 overlapStart 0 TRAPPC12
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
This section demonstrates the flexibility of the annotaition functions in the ChIPpeakAnno. Instead of building a new annotation data, the argument bindingTypes and bindingRegion in annoPeak
function can be set to find the peaks within 5000 bp upstream and downstream of the TSS, which could be the user defined promoter region.
anno <- annotatePeakInBatch(peaks, AnnotationData=annoData,
output="nearestBiDirectionalPromoters",
bindingRegion=c(-5000, 500))
anno$symbol <- xget(anno$feature, org.Hs.egSYMBOL)
anno[anno$peak=="peak12725"]
## GRanges object with 2 ranges and 12 metadata columns:
## seqnames ranges strand | score signalValue pValue
## <Rle> <IRanges> <Rle> | <integer> <numeric> <numeric>
## peak12725 chr2 28112981-28113476 * | 34 110.72 -1
## peak12725 chr2 28112981-28113476 * | 34 110.72 -1
## qValue peak feature feature.ranges feature.strand
## <numeric> <character> <character> <IRanges> <Rle>
## peak12725 -1 peak12725 9577 28113482-28561767 +
## peak12725 -1 peak12725 64080 28004266-28113223 -
## distance insideFeature distanceToSite symbol
## <integer> <character> <integer> <character>
## peak12725 5 upstream 5 BABAM2
## peak12725 0 overlapStart 0 RBKS
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
The annotated peaks can be visualized with R/Bioconductor package trackViewer developed by our group.
library(trackViewer)
gr <- peak <- peaks["peak12725"]
start(gr) <- start(gr) - 5000
end(gr) <- end(gr) + 5000
if(.Platform$OS.type != "windows"){
peak12725 <- importScore(file=system.file("extdata", "Tead4.bigWig",
package="ChIPpeakAnno"),
ranges=peak, format = "BigWig")
}else{## rtracklayer can not import bigWig files on Windows
load(file.path(dirname(path), "cvglist.rds"))
peak12725 <- Views(cvglists[["Tead4"]][[as.character(seqnames(peak))]],
start(peak),
end(peak))
peak12725 <- viewApply(peak12725, as.numeric)
tmp <- rep(peak, width(peak))
width(tmp) <- 1
tmp <- shift(tmp, shift=0:(width(peak)-1))
mcols(tmp) <- peak12725
colnames(mcols(tmp)) <- "score"
peak12725 <- new("track", dat=tmp,
name="peak12725",
type="data",
format="BED")
}
trs <- geneModelFromTxdb(TxDb.Hsapiens.UCSC.hg19.knownGene,
org.Hs.eg.db, gr)
names(trs) <- paste(sapply(trs, function(.ele) .ele@name), names(trs), sep=":")
optSty <- optimizeStyle(trackList(peak12725, trs, heightDist = c(.3, .7)),
theme="bw")
viewTracks(optSty$tracks, gr=gr, viewerStyle=optSty$style)