## ----setup, echo=FALSE, results="hide"---------------------------------------- knitr::opts_chunk$set(tidy = FALSE, cache = FALSE, dev = "png", message = FALSE, error = FALSE, warning = TRUE) ## ---- echo=FALSE, results="hide", warning=FALSE------------------------------- suppressPackageStartupMessages({ library(GenomicRanges) library(GenomicAlignments) library(rtracklayer) library(ggplot2) library(tidyr) library(ComplexHeatmap) library(BindingSiteFinder) library(forcats) library(dplyr) }) ## ----BiocManager, eval=FALSE-------------------------------------------------- # if (!require("BiocManager")) # install.packages("BiocManager") # BiocManager::install("BindingSiteFinder") ## ----------------------------------------------------------------------------- library(rtracklayer) csFile <- system.file("extdata", "PureCLIP_crosslink_sites_example.bed", package="BindingSiteFinder") cs = import(con = csFile, format = "BED") cs ## ---- fig.retina = 1, dpi = 100----------------------------------------------- library(ggplot2) quants = quantile(cs$score, probs = seq(0,1, by = 0.05)) csFilter = cs[cs$score >= quants[2]] ggplot(data = data.frame(score = cs$score), aes(x = log2(score))) + geom_histogram(binwidth = 0.5) + geom_vline(xintercept = log2(quants[2])) + theme_classic() + xlab("PureCLIP score (log2)") + ylab("Count") ## ----------------------------------------------------------------------------- # Load clip signal files and define meta data object files <- system.file("extdata", package="BindingSiteFinder") clipFilesP <- list.files(files, pattern = "plus.bw$", full.names = TRUE) clipFilesM <- list.files(files, pattern = "minus.bw$", full.names = TRUE) meta = data.frame( id = c(1,2,3,4), condition = factor(c("WT", "WT", "KD", "KD"), levels = c("KD", "WT")), clPlus = clipFilesP, clMinus = clipFilesM) meta ## ---- eval=FALSE-------------------------------------------------------------- # library(BindingSiteFinder) # bds = BSFDataSetFromBigWig(ranges = csFilter, meta = meta, silent = TRUE) ## ----------------------------------------------------------------------------- exampleFile <- list.files(files, pattern = ".rda$", full.names = TRUE) load(exampleFile) bds ## ---- fig.retina = 1, dpi = 100----------------------------------------------- supportRatioPlot(bds, bsWidths = seq(from = 3, to = 19, by = 2)) ## ----------------------------------------------------------------------------- bds1 <- makeBindingSites(object = bds, bsSize = 3, minWidth = 3, minCrosslinks = 2, minClSites = 1) bds2 <- makeBindingSites(object = bds, bsSize = 9, minWidth = 3, minCrosslinks = 2, minClSites = 1) bds3 <- makeBindingSites(object = bds, bsSize = 19, minWidth = 3, minCrosslinks = 2, minClSites = 1) bds4 <- makeBindingSites(object = bds, bsSize = 29, minWidth = 3, minCrosslinks = 2, minClSites = 1) l = list(`1. bsSize = 3` = bds1, `2. bsSize = 9` = bds2, `3. bsSize = 19` = bds3, `4. bsSize = 29` = bds4) ## ---- fig.retina = 1, dpi = 100----------------------------------------------- rangeCoveragePlot(l, width = 20) ## ----------------------------------------------------------------------------- bds1 <- makeBindingSites(object = bds, bsSize = 9, minWidth = 3, minCrosslinks = 2, minClSites = 1) bds2 <- makeBindingSites(object = bds, bsSize = 9, minWidth = 3, minCrosslinks = 2, minClSites = 2) bds3 <- makeBindingSites(object = bds, bsSize = 9, minWidth = 3, minCrosslinks = 2, minClSites = 3) bds4 <- makeBindingSites(object = bds, bsSize = 9, minWidth = 3, minCrosslinks = 2, minClSites = 4) ## ---- fig.retina = 1, dpi = 100----------------------------------------------- l = list(`1. minClSites = 1` = bds1, `2. minClSites = 2` = bds2, `3. minClSites = 3` = bds3, `4. minClSites = 4` = bds4) mergeSummaryPlot(l, select = "minClSites") ## ---- fig.retina = 1, dpi = 100----------------------------------------------- l = list(`1. minClSites = 1` = bds1, `2. minClSites = 2` = bds2, `3. minClSites = 3` = bds3, `4. minClSites = 4` = bds4) rangeCoveragePlot(l, width = 20) ## ----------------------------------------------------------------------------- bds10 <- makeBindingSites(object = bds, bsSize = 3, minWidth = 3, minCrosslinks = 1, minClSites = 1) bds20 <- makeBindingSites(object = bds, bsSize = 9, minWidth = 3, minCrosslinks = 3, minClSites = 1) bds30 <- makeBindingSites(object = bds, bsSize = 19, minWidth = 3, minCrosslinks = 6, minClSites = 1) bds40 <- makeBindingSites(object = bds, bsSize = 29, minWidth = 3, minCrosslinks = 10, minClSites = 1) ## ---- fig.retina = 1, dpi = 100----------------------------------------------- reproducibiliyCutoffPlot(bds1, max.range = 20, cutoff = c(0.05)) reproducibiliyCutoffPlot(bds1, max.range = 20, cutoff = c(0.2)) ## ---- fig.retina = 1, dpi = 100----------------------------------------------- reproducibiliyCutoffPlot(bds1, max.range = 20, cutoff = c(0.05, 0.1)) reproducibiliyCutoffPlot(bds1, max.range = 20, cutoff = c(0.1, 0.05)) ## ---- fig.retina = 1, dpi = 100----------------------------------------------- s1 = reproducibilityFilter(bds1, cutoff = c(0.05), n.reps = c(3), returnType = "data.frame") library(ComplexHeatmap) m1 = make_comb_mat(s1) UpSet(m1) ## ---- fig.retina = 1, dpi = 100----------------------------------------------- s2 = reproducibilityFilter(bds1, cutoff = c(0.1, 0.05), n.reps = c(1, 2), returnType = "data.frame") m2 = make_comb_mat(s2) UpSet(m2) ## ----------------------------------------------------------------------------- bdsFinal = reproducibilityFilter(bds1, cutoff = c(0.1, 0.05), n.reps = c(1, 2)) getRanges(bdsFinal) ## ----------------------------------------------------------------------------- bdsFinal = annotateWithScore(bdsFinal, getRanges(bds)) bindingSites = getRanges(bdsFinal) bindingSites ## ---- eval=FALSE-------------------------------------------------------------- # library(GenomicFeatures) # # Make annotation database from gff3 file # annoFile = "./gencode.v37.annotation.chr22.header.gff3" # annoDb = makeTxDbFromGFF(file = annoFile, format = "gff3") # annoInfo = import(annoFile, format = "gff3") # # Get genes as GRanges # gns = genes(annoDb) # idx = match(gns$gene_id, annoInfo$gene_id) # elementMetadata(gns) = cbind(elementMetadata(gns), # elementMetadata(annoInfo)[idx,]) ## ----------------------------------------------------------------------------- # Load GRanges with genes geneFile <- list.files(files, pattern = "gns.rds$", full.names = TRUE) load(geneFile) gns ## ---- fig.retina = 1, dpi = 100----------------------------------------------- # count the number of annotation overlaps mcols(bindingSites)$geneOl = factor(countOverlaps(bindingSites, gns)) df = as.data.frame(mcols(bindingSites)) # ggplot(df, aes(x = geneOl)) + geom_bar() + scale_y_log10() + geom_text(stat='count', aes(label=..count..), vjust=-0.3) + labs( title = "Overlapping gene annotation problem", caption = "Bar plot that show how many times a binding site overlaps with an annotation of a different gene.", x = "Number of overlaps", y = "Count (log10)" ) + theme_bw() ## ---- fig.retina = 1, dpi = 100----------------------------------------------- # show which gene types cause the most annoation overlaps. Split binding sites # by the number of overlaps and remove those binding sites that do not overlap # with any annotation (intergenic) library(forcats) splitSites = split(bindingSites, bindingSites$geneOl) df = as(lapply(splitSites, function(x){subsetByOverlaps(gns,x)}), "GRangesList") df = df[2:length(df)] df = lapply(1:length(df), function(x) { data.frame(olClass = names(df[x]), geneType = factor(df[[x]]$gene_type))}) df = do.call(rbind,df) df = df %>% mutate(geneType = fct_lump_n(geneType, 4)) # ggplot(df, aes(x = olClass, fill = geneType)) + geom_bar(position = "fill") + scale_fill_brewer(palette = "Set2") + scale_y_continuous(labels = scales::percent) + labs( title = "Overlapping gene annotation causes", caption = "Percentage bar plot that show the fraction for each gene type and overlap number.", fill = "Gene type", x = "Number of overlaps", y = "Percent" ) + theme_bw() ## ----------------------------------------------------------------------------- # Remove all binding sites that overlap multiple gene annotations and retain # only protein coding genes and binding sites. bindingSites = subset(bindingSites, geneOl == 1) targetGenes = subsetByOverlaps(gns, bindingSites) targetGenes = subset(targetGenes, gene_type == "protein_coding") ## ---- eval=FALSE-------------------------------------------------------------- # # Count the overlaps of each binding site fore each part of the transcript. # cdseq = cds(annoDb) # intrns = unlist(intronsByTranscript(annoDb)) # utrs3 = unlist(threeUTRsByTranscript(annoDb)) # utrs5 = unlist(fiveUTRsByTranscript(annoDb)) # regions = list(CDS = cdseq, Intron = intrns, UTR3 = utrs3, UTR5 = utrs5) ## ----------------------------------------------------------------------------- # Load list with transcript regions regionFile <- list.files(files, pattern = "regions.rds$", full.names = TRUE) load(regionFile) ## ---- fig.retina = 1, dpi = 100----------------------------------------------- # Count the overlaps of each binding site fore each part of the transcript. cdseq = regions$CDS %>% countOverlaps(bindingSites,.) intrns = regions$Intron %>% countOverlaps(bindingSites,.) utrs3 = regions$UTR3 %>% countOverlaps(bindingSites,.) utrs5 = regions$UTR5 %>% countOverlaps(bindingSites,.) countDf = data.frame(CDS = cdseq, Intron = intrns, UTR3 = utrs3, UTR5 = utrs5) # Counting how many times an annotation is not present. df = data.frame(olClass = apply(countDf,1,function(x) length(x[x == 0]))) df = data.frame(type = rev(names(table(df))), count = as.vector(table(df))) ggplot(df, aes(x = type, y = count)) + geom_col() + scale_y_log10() + geom_text(aes(label = count), vjust=-0.3) + labs( title = "Overlapping transcript annotation crashes", caption = "Bar plot that show how many times a binding site overlaps with an annotation of a different transcript part", x = "Number of overlaps", y = "Count (log10)" ) + theme_bw() ## ---- fig.retina = 1, dpi = 100----------------------------------------------- # Count the number of binding sites within each transcript region, ignoring the # problem of overlapping annotations (counting binding sites multiple times). plotCountDf = countDf plotCountDf[plotCountDf > 1] = 1 df = plotCountDf %>% pivot_longer(everything()) %>% group_by(name) %>% summarize(count = sum(value), .groups = "drop") %>% mutate(name = as.factor(name)) %>% mutate(name = fct_relevel(name, "Intron", "UTR3", "CDS", "UTR5")) ggplot(df, aes(x = name, y = count)) + geom_col() + scale_y_log10() + xlab("Number of overlaps") + ylab("Count") + geom_text(aes(label = count), vjust=-0.3) + labs( title = "Binding sites per genomic region", caption = "Bar plot that shows the number of binding sites per genomic region.", x = "Genomic region", y = "Count (log10)" ) + theme_bw() ## ---- fig.retina = 1, dpi = 100----------------------------------------------- # Show how many annotation overlaps are caused by what transcript types. m = make_comb_mat(plotCountDf) UpSet(m) ## ---- fig.retina = 1, dpi = 100----------------------------------------------- # Solve the overlapping transcript types problem with majority votes and # hierarchical rule. rule = c("Intron", "UTR3", "CDS", "UTR5") # Prepare dataframe for counting (reorder by rule, add intergenic counts) countDf = countDf[, rule] %>% as.matrix() %>% cbind.data.frame(., Intergenic = ifelse(rowSums(countDf) == 0, 1, 0) ) # Apply majority vote scheme regionName = colnames(countDf) region = apply(countDf, 1, function(x){regionName[which.max(x)]}) mcols(bindingSites)$region = region df = data.frame(Region = region) # ggplot(df, aes(x = fct_infreq(Region))) + geom_bar() + scale_y_log10() + geom_text(stat='count', aes(label=..count..), vjust=-0.3) + labs( title = "Binding sites per genomic region", caption = "Bar plot that shows the number of binding sites per genomic region.", x = "Genomic region", y = "Count (log10)" ) + theme_bw() ## ----------------------------------------------------------------------------- # Sum up the length of each transcript region with overlapping binding sites. cdsLen = regions$CDS %>% subsetByOverlaps(., bindingSites) %>% width() %>% sum() intrnsLen = regions$Intron %>% subsetByOverlaps(., bindingSites) %>% width() %>% sum() utrs3Len = regions$UTR3 %>% subsetByOverlaps(., bindingSites) %>% width() %>% sum() utrs5Len = regions$UTR5 %>% subsetByOverlaps(., bindingSites) %>% width() %>% sum() lenSum = c(cdsLen, intrnsLen, utrs3Len, utrs5Len) ## ---- fig.retina = 1, dpi = 100----------------------------------------------- # Compute the relative enrichment per transcript region. bindingSites = subset(bindingSites, region != "Intergenic") df = data.frame(region = names(table(bindingSites$region)), count = as.vector(table(bindingSites$region))) df$regionLength = lenSum df$normalizedCount = df$count / df$regionLength # ggplot(df, aes(x = region, y = normalizedCount)) + geom_col(width = 0.5) + labs( title = "Relative enrichment per genomic region ", caption = "Bar plot of region count, divided by the summed length.", x = "Genomic region", y = "Relative enrichment", fill = "Genomic region" ) + theme_bw() ## ----------------------------------------------------------------------------- sessionInfo()