## ----setup, echo=FALSE, results="hide"---------------------------------------- knitr::opts_chunk$set(tidy = FALSE, cache = FALSE, dev = "png", message = FALSE, error = FALSE, warning = TRUE) ## ---- echo=FALSE, results="hide", warning=FALSE------------------------------- suppressPackageStartupMessages({ library(GenomicRanges) library(GenomicAlignments) library(rtracklayer) library(ggplot2) library(tidyr) library(ComplexHeatmap) library(BindingSiteFinder) library(forcats) library(dplyr) }) ## ----BiocManager, eval=FALSE-------------------------------------------------- # if (!require("BiocManager")) # install.packages("BiocManager") # BiocManager::install("BindingSiteFinder") ## ----------------------------------------------------------------------------- library(rtracklayer) csFile <- system.file("extdata", "PureCLIP_crosslink_sites_examples.bed", package="BindingSiteFinder") cs = import(con = csFile, format = "BED", extraCols=c("additionalScores" = "character")) cs$additionalScores = NULL cs ## ---- fig.retina = 1, dpi = 100----------------------------------------------- library(ggplot2) quants = quantile(cs$score, probs = seq(0,1, by = 0.05)) csFilter = cs[cs$score >= quants[2]] ggplot(data = data.frame(score = cs$score), aes(x = log2(score))) + geom_histogram(binwidth = 0.5) + geom_vline(xintercept = log2(quants[2])) + theme_classic() + xlab("PureCLIP score (log2)") + ylab("Count") ## ----------------------------------------------------------------------------- # Load clip signal files and define meta data object files <- system.file("extdata", package="BindingSiteFinder") clipFilesP <- list.files(files, pattern = "plus.bw$", full.names = TRUE) clipFilesM <- list.files(files, pattern = "minus.bw$", full.names = TRUE) meta = data.frame( id = c(1:4), condition = factor(rep("WT", 4)), clPlus = clipFilesP, clMinus = clipFilesM) meta ## ---- eval=FALSE-------------------------------------------------------------- # library(BindingSiteFinder) # bds = BSFDataSetFromBigWig(ranges = csFilter, meta = meta, silent = TRUE) ## ---- eval=TRUE, echo=FALSE--------------------------------------------------- exampleFile <- list.files(files, pattern = ".rda$", full.names = TRUE) load(exampleFile) meta = data.frame( id = c(1:4), condition = factor(rep("WT", 4)), clPlus = clipFilesP, clMinus = clipFilesM) bds@meta = meta names(bds@signal$signalPlus) = c("1_WT", "2_WT", "3_WT", "4_WT") names(bds@signal$signalMinus) = c("1_WT", "2_WT", "3_WT", "4_WT") ## ---- eval=FALSE-------------------------------------------------------------- # exampleFile <- list.files(files, pattern = ".rda$", full.names = TRUE) # load(exampleFile) # bds ## ---- fig.retina = 1, dpi = 100----------------------------------------------- supportRatioPlot(bds, bsWidths = seq(from = 3, to = 29, by = 2), minWidth = 2) ## ----------------------------------------------------------------------------- bds1 <- makeBindingSites(object = bds, bsSize = 3, minWidth = 2, minCrosslinks = 2, minClSites = 1) bds2 <- makeBindingSites(object = bds, bsSize = 9, minWidth = 2, minCrosslinks = 2, minClSites = 1) bds3 <- makeBindingSites(object = bds, bsSize = 19, minWidth = 2, minCrosslinks = 2, minClSites = 1) bds4 <- makeBindingSites(object = bds, bsSize = 29, minWidth = 2, minCrosslinks = 2, minClSites = 1) l = list(`1. bsSize = 3` = bds1, `2. bsSize = 9` = bds2, `3. bsSize = 19` = bds3, `4. bsSize = 29` = bds4) ## ---- fig.retina = 1, dpi = 100----------------------------------------------- rangeCoveragePlot(l, width = 20) ## ----------------------------------------------------------------------------- bds1 <- makeBindingSites(object = bds, bsSize = 9, minWidth = 2, minCrosslinks = 2, minClSites = 1) bds2 <- makeBindingSites(object = bds, bsSize = 9, minWidth = 2, minCrosslinks = 2, minClSites = 2) bds3 <- makeBindingSites(object = bds, bsSize = 9, minWidth = 2, minCrosslinks = 2, minClSites = 3) bds4 <- makeBindingSites(object = bds, bsSize = 9, minWidth = 2, minCrosslinks = 2, minClSites = 4) ## ---- fig.retina = 1, dpi = 100----------------------------------------------- l = list(`1. minClSites = 1` = bds1, `2. minClSites = 2` = bds2, `3. minClSites = 3` = bds3, `4. minClSites = 4` = bds4) mergeSummaryPlot(l, select = "minClSites") ## ---- fig.retina = 1, dpi = 100----------------------------------------------- l = list(`1. minClSites = 1` = bds1, `2. minClSites = 2` = bds2, `3. minClSites = 3` = bds3, `4. minClSites = 4` = bds4) rangeCoveragePlot(l, width = 20) ## ----------------------------------------------------------------------------- bds10 <- makeBindingSites(object = bds, bsSize = 3, minWidth = 2, minCrosslinks = 1, minClSites = 1) bds20 <- makeBindingSites(object = bds, bsSize = 9, minWidth = 2, minCrosslinks = 3, minClSites = 1) bds30 <- makeBindingSites(object = bds, bsSize = 19, minWidth = 2, minCrosslinks = 6, minClSites = 1) bds40 <- makeBindingSites(object = bds, bsSize = 29, minWidth = 2, minCrosslinks = 10, minClSites = 1) ## ----------------------------------------------------------------------------- bds1 <- makeBindingSites(object = bds, bsSize = 7, minWidth = 2, minCrosslinks = 2, minClSites = 1) ## ---- fig.retina = 1, dpi = 100----------------------------------------------- reproducibiliyCutoffPlot(bds1, max.range = 20, cutoff = 0.05) ## ---- fig.retina = 1, dpi = 100----------------------------------------------- s1 = reproducibilityFilter(bds1, cutoff = 0.05, n.reps = 3, returnType = "data.frame") library(ComplexHeatmap) m1 = make_comb_mat(s1) UpSet(m1) ## ----------------------------------------------------------------------------- bdsFinal = reproducibilityFilter(bds1, cutoff = 0.05, n.reps = 3) getRanges(bdsFinal) ## ----------------------------------------------------------------------------- bdsFinal = annotateWithScore(bdsFinal, getRanges(bds)) # set nice binding site names finalRanges = getRanges(bdsFinal) names(finalRanges) = paste0("BS", seq_along(finalRanges)) bdsFinal = setRanges(bdsFinal, finalRanges) bindingSites = getRanges(bdsFinal) bindingSites ## ---- eval=FALSE-------------------------------------------------------------- # library(rtracklayer) # export(object = bindingSites, con = "./bindingSites.bed", format = "BED") ## ---- eval=FALSE-------------------------------------------------------------- # library(GenomicFeatures) # # Make annotation database from gff3 file # annoFile = "./gencode.v37.annotation.chr22.header.gff3" # annoDb = makeTxDbFromGFF(file = annoFile, format = "gff3") # annoInfo = import(annoFile, format = "gff3") # # Get genes as GRanges # gns = genes(annoDb) # idx = match(gns$gene_id, annoInfo$gene_id) # elementMetadata(gns) = cbind(elementMetadata(gns), # elementMetadata(annoInfo)[idx,]) ## ----------------------------------------------------------------------------- # Load GRanges with genes geneFile <- list.files(files, pattern = "gns.rds$", full.names = TRUE) load(geneFile) gns ## ---- fig.retina = 1, dpi = 100----------------------------------------------- # Count the number of annotation overlaps mcols(bindingSites)$geneOl = factor(countOverlaps(bindingSites, gns)) df = as.data.frame(mcols(bindingSites)) ggplot(df, aes(x = geneOl)) + geom_bar() + scale_y_log10() + geom_text(stat='count', aes(label=..count..), vjust=-0.3) + labs( title = "Overlapping gene annotation problem", subtitle = "Bar plot shows how many times a binding site overlaps with an annotated gene.", x = "Number of overlaps", y = "Count (log10)" ) + theme_bw() ## ---- fig.retina = 1, dpi = 100----------------------------------------------- # Show which gene types cause the most annotation overlaps. Split binding sites # by the number of overlaps and remove those binding sites that do not overlap # with any annotation (intergenic) library(forcats) idx = findOverlaps(gns, bindingSites) df = data.frame(geneType = gns$gene_type[queryHits(idx)], overlapType = bindingSites$geneOl[subjectHits(idx)]) %>% mutate(geneType = as.factor(geneType)) %>% mutate(geneType = fct_lump_n(geneType, 4)) %>% group_by(overlapType, geneType) %>% tally() ggplot(df, aes(x = overlapType, y = n, fill = geneType)) + geom_col(position = "fill") + scale_fill_brewer(palette = "Set2") + scale_y_continuous(labels = scales::percent) + labs( title = "Overlapping gene annotation causes", subtitle = "Percentage bar plot that show the fraction for each gene type and overlap number", fill = "Gene type", x = "Number of overlaps", y = "Percent" ) + theme_bw() ## ---- fig.retina = 1, dpi = 100----------------------------------------------- df = data.frame(geneType = gns$gene_type[queryHits(idx)], overlapType = bindingSites$geneOl[subjectHits(idx)]) %>% filter(overlapType == 1) %>% group_by(geneType) %>% tally() %>% arrange(n) %>% mutate(geneType = factor(geneType, levels = geneType)) ggplot(df, aes(x = geneType, y = n, fill = geneType, label = n)) + geom_col(color = "black") + scale_y_log10() + coord_flip() + scale_fill_brewer(palette = "Greys", direction = -1) + theme_bw() + theme(legend.position = "none") + geom_text(color = "blue") + labs( title = "Clean RBP target spectrum", x = "", y = "#N (log10)" ) ## ----------------------------------------------------------------------------- # Remove all binding sites that overlap multiple gene annotations and retain # only protein-coding genes and binding sites. bindingSites = subset(bindingSites, geneOl == 1) targetGenes = subsetByOverlaps(gns, bindingSites) targetGenes = subset(targetGenes, gene_type == "protein_coding") ## ---- eval=FALSE-------------------------------------------------------------- # # Count the overlaps of each binding site for each region of the transcript. # cdseq = cds(annoDb) # intrns = unlist(intronsByTranscript(annoDb)) # utrs3 = unlist(threeUTRsByTranscript(annoDb)) # utrs5 = unlist(fiveUTRsByTranscript(annoDb)) # regions = list(CDS = cdseq, Intron = intrns, UTR3 = utrs3, UTR5 = utrs5) ## ----------------------------------------------------------------------------- # Load list with transcript regions regionFile <- list.files(files, pattern = "regions.rds$", full.names = TRUE) load(regionFile) ## ---- fig.retina = 1, dpi = 100----------------------------------------------- # Count the overlaps of each binding site fore each region of the transcript. cdseq = regions$CDS %>% countOverlaps(bindingSites,.) intrns = regions$Intron %>% countOverlaps(bindingSites,.) utrs3 = regions$UTR3 %>% countOverlaps(bindingSites,.) utrs5 = regions$UTR5 %>% countOverlaps(bindingSites,.) countDf = data.frame(CDS = cdseq, Intron = intrns, UTR3 = utrs3, UTR5 = utrs5) # Count how many times an annotation is not present. df = data.frame(olClass = apply(countDf,1,function(x) length(x[x == 0]))) df = data.frame(type = rev(names(table(df))), count = as.vector(table(df))) ggplot(df, aes(x = type, y = count)) + geom_col() + scale_y_log10() + geom_text(aes(label = count), vjust=-0.3) + labs( title = "Overlapping transcript annotation clashes", subtitle = "Bar plot shows how many times a binding site overlaps with an annotation of a different \ntranscript region.", x = "Number of overlaps", y = "Count (log10)" ) + theme_bw() ## ---- fig.retina = 1, dpi = 100----------------------------------------------- # Count the number of binding sites within each transcript region, ignoring the # problem of overlapping annotations (counting binding sites multiple times). plotCountDf = countDf plotCountDf[plotCountDf > 1] = 1 df = plotCountDf %>% pivot_longer(everything()) %>% group_by(name) %>% summarize(count = sum(value), .groups = "drop") %>% arrange(count) %>% mutate(name = factor(name, levels = name)) ggplot(df, aes(x = name, y = count, fill = name)) + geom_col(color = "black") + scale_y_log10() + coord_flip() + scale_fill_brewer(palette = "Greys", direction = -1) + xlab("Number of overlaps") + ylab("Count") + geom_text(aes(label = count), color = "blue") + labs( title = "Binding sites per transcript region - unresolved annotation overlaps", subtitle = "Bar plot that shows the number of binding sites per transcript region.", x = "Transcript region", y = "#N (log10)" ) + theme_bw() + theme(legend.position = "none") ## ---- fig.retina = 1, dpi = 100----------------------------------------------- # Show how many annotation overlaps are caused by what transcript regions m = make_comb_mat(plotCountDf) UpSet(m) ## ---- fig.retina = 1, dpi = 100----------------------------------------------- # Solve the overlapping transcript region problem with majority votes and # hierarchical rule. rule = c("Intron", "UTR3", "CDS", "UTR5") # Prepare dataframe for counting (reorder by rule, add intergenic counts) countDf = countDf[, rule] %>% as.matrix() %>% cbind.data.frame(., Intergenic = ifelse(rowSums(countDf) == 0, 1, 0) ) # Apply majority vote scheme regionName = colnames(countDf) region = apply(countDf, 1, function(x){regionName[which.max(x)]}) mcols(bindingSites)$region = region df = data.frame(Region = region) %>% group_by(Region) %>% tally() %>% arrange(n) %>% mutate(Region = factor(Region, levels = Region)) ggplot(df, aes(x = Region, y = n, fill = Region)) + geom_col(color = "black") + scale_y_log10() + coord_flip() + scale_fill_brewer(palette = "Greys", direction = -1) + xlab("Number of overlaps") + ylab("Count") + geom_text(aes(label = n), color = "blue") + labs( title = "Binding sites per transcript region - resolved annotation overlaps", subtitle = "Bar plot that shows the number of binding sites per transcript region.", x = "Transcript region", y = "#N (log10)" ) + theme_bw() + theme(legend.position = "none") ## ----------------------------------------------------------------------------- # Function that selects the first hosting region of each binding site and then # sums up the lengths from all of them getRegionLengthByFirstMatch <- function(region, bindingsites) { # region = regions$Intron # bindingsites = bindingSites idx = findOverlaps(region, bindingsites) %>% as.data.frame() %>% group_by(subjectHits) %>% arrange(queryHits) %>% filter(row_number() == 1) len = region[idx$queryHits] %>% width() %>% sum() return(len) } regionLength = lapply(regions, function(x){ getRegionLengthByFirstMatch(x, bindingSites) }) %>% unlist() %>% as.data.frame() ## ---- fig.retina = 1, dpi = 100----------------------------------------------- # Compute the relative enrichment per transcript region. bindingSites = subset(bindingSites, region != "Intergenic") df = as.data.frame(mcols(bindingSites)) %>% dplyr::select(region) %>% group_by(region) %>% tally() %>% mutate(regionLength = regionLength[,1]) %>% mutate(normalizedCount = n / regionLength) %>% arrange(normalizedCount) %>% mutate(region = factor(region, levels = region)) # ggplot(df, aes(x = region, y = normalizedCount, fill = region)) + geom_col(color = "black") + coord_flip() + scale_fill_brewer(palette = "Greys", direction = -1) + labs( title = "Relative enrichment per transcript region ", subtitle = "Bar plot of region count, divided by the summed length.", x = "Transcript region", y = "Relative enrichment", fill = "Transcript region" ) + theme_bw() + theme(legend.position = "none") ## ---- fig.retina = 1, dpi = 100----------------------------------------------- set.seed(1234) bdsSub = bds[sample(seq_along(getRanges(bds)), 100, replace = FALSE)] cov = coverageOverRanges(bdsSub, returnOptions = "merge_positions_keep_replicates") df = mcols(cov) %>% as.data.frame() %>% pivot_longer(everything()) ggplot(df, aes(x = name, y = log2(value+1), fill = name)) + geom_violin() + geom_boxplot(width = 0.1, fill = "white") + scale_fill_brewer(palette = "Greys") + theme_bw() + theme(legend.position = "none") + labs(x = "Samples", y = "#Crosslinks (log2)") ## ---- fig.retina = 1, dpi = 100----------------------------------------------- # candidateGenes = gns[sample(seq_along(gns), 10, replace = FALSE)] protGenes = gns[gns$gene_type == "protein_coding"] lncGenes = gns[gns$gene_type == "lncRNA"] calcCoverageForGeneSet <- function(geneSet, bsfObj) { idx = findOverlaps(geneSet, getRanges(bsfObj)) currBsfObj = bsfObj[subjectHits(idx)] cov = coverageOverRanges(currBsfObj, returnOptions = "merge_positions_keep_replicates") df = mcols(cov) %>% as.data.frame() %>% pivot_longer(everything()) return(df) } dfProtGenes = calcCoverageForGeneSet(geneSet = protGenes, bsfObj = bds) %>% cbind.data.frame(geneType = "protein coding") dfLncGenes = calcCoverageForGeneSet(geneSet = lncGenes, bsfObj = bds) %>% cbind.data.frame(geneType = "lncRNA") df = rbind(dfProtGenes, dfLncGenes) ggplot(df, aes(x = name, y = log2(value+1), fill = geneType)) + geom_boxplot() + scale_fill_brewer(palette = "Greys") + theme_bw() + labs(x = "Samples", y = "#Crosslinks (log2)") ## ---- fig.retina = 1, dpi = 100----------------------------------------------- bdsMerge = collapseReplicates(bds)[1:100] covTotal = coverageOverRanges(bdsMerge, returnOptions = "merge_positions_keep_replicates") covRep = coverageOverRanges(bds[1:100], returnOptions = "merge_positions_keep_replicates") df = cbind.data.frame(mcols(covTotal), mcols(covRep)) %>% mutate(rep1 = round(`1_WT`/ WT, digits = 2) * 100, rep2 = round(`2_WT`/ WT, digits = 2) * 100, rep3 = round(`3_WT`/ WT, digits = 2) * 100, rep4 = round(`4_WT`/ WT, digits = 2) * 100) %>% tibble::rowid_to_column("BsID") %>% dplyr::select(BsID, rep1, rep2, rep3, rep4) %>% pivot_longer(-BsID) %>% group_by(BsID) %>% arrange(desc(value), .by_group = TRUE) %>% mutate(name = factor(name, levels = name)) %>% group_by(name) %>% arrange(desc(value), .by_group = TRUE) %>% mutate(BsID = factor(BsID, levels = BsID)) ggplot(df, aes(x = BsID, y = value, fill = name)) + geom_col(position = "fill", width = 1) + theme_bw() + scale_fill_brewer(palette = "Set3") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 7)) + labs(x = "Binding site ID", y = "Percentage", fill = "Replicate" ) + scale_y_continuous(labels = scales::percent) ## ---- fig.retina = 1, dpi = 100----------------------------------------------- bindingSiteCoveragePlot(bdsFinal, plotIdx = 8, flankPos = 100, autoscale = TRUE) bindingSiteCoveragePlot(bdsFinal, plotIdx = 8, flankPos = 100, mergeReplicates = TRUE) ## ---- fig.retina = 1, dpi = 100----------------------------------------------- rangesBeforeRepFilter = getRanges(bds1) rangesAfterRepFilter = getRanges(bdsFinal) idx = which(!match(rangesBeforeRepFilter, rangesAfterRepFilter, nomatch = 0) > 0) rangesRemovedByFilter = rangesBeforeRepFilter[idx] bdsRemovedRanges = setRanges(bds1, rangesRemovedByFilter) bindingSiteCoveragePlot(bdsRemovedRanges, plotIdx = 2, flankPos = 50) bindingSiteCoveragePlot(bdsRemovedRanges, plotIdx = 2, flankPos = 50, mergeReplicates = TRUE) ## ---- fig.retina = 1, dpi = 100----------------------------------------------- pSites = getRanges(bds) bindingSiteCoveragePlot(bdsFinal, plotIdx = 8, flankPos = 20, autoscale = TRUE, customRange = pSites, customRange.name = "pSites", shiftPos = -10) ## ---- fig.retina = 1, dpi = 100----------------------------------------------- bindingSiteCoverage = coverageOverRanges(bdsFinal, returnOptions = "merge_positions_keep_replicates") idxMaxCountsBs = which.max(rowSums(as.data.frame(mcols(bindingSiteCoverage)))) bindingSiteCoveragePlot(bdsFinal, plotIdx = idxMaxCountsBs, flankPos = 100, mergeReplicates = FALSE, shiftPos = 50) ## ----------------------------------------------------------------------------- bdsCDS = setRanges(bdsFinal, regions$CDS) cdsWithMostBs = which.max(countOverlaps(regions$CDS, getRanges(bdsFinal))) bindingSiteCoveragePlot(bdsCDS, plotIdx = cdsWithMostBs, showCentralRange = FALSE, flankPos = -250, shiftPos = -50, mergeReplicates = TRUE, highlight = FALSE, customRange = getRanges(bdsFinal)) bindingSiteCoveragePlot(bdsCDS, plotIdx = cdsWithMostBs, showCentralRange = FALSE, flankPos = -250, shiftPos = -50, mergeReplicates = TRUE, highlight = FALSE, customRange = getRanges(bdsFinal), customAnnotation = regions$CDS) ## ---- eval=FALSE-------------------------------------------------------------- # library(rtracklayer) # rangesBeforeRepFilter = getRanges(bds1) # rangesAfterRepFilter = getRanges(bdsFinal) # export(object = rangesBeforeRepFilter, con = "./rangesBeforeRepFilter.bed", format = "BED") # export(object = rangesAfterRepFilter, con = "./rangesAfterRepFilter.bed", format = "BED") ## ---- eval=FALSE-------------------------------------------------------------- # sgn = getSignal(bds) # export(sgn$signalPlus$`1_WT`, con = "./WT_1_plus.bw", format = "bigwig") # export(sgn$signalMinus$`1_WT`, con = "./WT_1_minus.bw", format = "bigwig") ## ---- eval=FALSE-------------------------------------------------------------- # bdsMerge = collapseReplicates(bds) # sgn = getSignal(bdsMerge) # export(sgn$signalPlus$WT, con = "./sgn_plus.bw", format = "bigwig") # export(sgn$signalPlus$WT, con = "./sgn_minus.bw", format = "bigwig") ## ----------------------------------------------------------------------------- # Set artificial KD condition metaCond = getMeta(bdsFinal) metaCond$condition = factor(c(rep("WT", 2), rep("KD", 2)), levels = c("WT", "KD")) bdsCond = setMeta(bdsFinal, metaCond) # Fix replicate names in signal namesCond = c("1_WT", "2_WT", "3_KD", "4_KD") sgn = getSignal(bdsCond) names(sgn$signalPlus) = namesCond names(sgn$signalMinus) = namesCond bdsCond = setSignal(bdsCond, sgn) ## ---- fig.retina = 1, dpi = 100----------------------------------------------- reproducibiliyCutoffPlot(bdsCond, max.range = c(20), n.reps = c(2,2), cutoff = c(0.1, 0.1)) reproducibiliyCutoffPlot(bdsCond, max.range = c(20), n.reps = c(2,2), cutoff = c(0.1, 0.05)) ## ---- eval=FALSE-------------------------------------------------------------- # bdsCond = reproducibilityFilter(bdsCond, n.reps = c(2,2), cutoff = c(0.1, 0.05)) ## ---- fig.retina = 1, dpi = 100----------------------------------------------- bindingSiteCoverage = coverageOverRanges(bdsCond, returnOptions = "merge_positions_keep_replicates") idxMaxCountsBs = which.max(rowSums(as.data.frame(mcols(bindingSiteCoverage)))) bindingSiteCoveragePlot(bdsCond, plotIdx = idxMaxCountsBs, flankPos = 100, mergeReplicates = FALSE, shiftPos = 50) ## ----------------------------------------------------------------------------- sessionInfo()