## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5, out.width = "80%", fig.align = "center", crop = NULL # suppress "The magick package is required to crop" issue ) library(BiocStyle) ## ---- eval=FALSE-------------------------------------------------------------- # if (!requireNamespace("BiocManager", quietly = TRUE)) # install.packages("BiocManager") # # BiocManager::install("monaLisa") ## ---- quick, eval=FALSE------------------------------------------------------- # # load package # library(monaLisa) # # # bin regions # # - peak_change is a numerical vector # # - peak_change needs to be created by the user to run this code # peak_bins <- bin(x = peak_change, binmode = "equalN", nElement = 400) # # # calculate motif enrichments # # - peak_seqs is a DNAStringSet, pwms is a PWMatrixList # # - peak_seqs and pwms need to be created by the user to run this code # se <- calcBinnedMotifEnrR(seqs = peak_seqs, # bins = peak_bins, # pwmL = pwms) ## ----loadlib, message=FALSE--------------------------------------------------- library(GenomicRanges) library(SummarizedExperiment) library(JASPAR2020) library(TFBSTools) library(BSgenome.Mmusculus.UCSC.mm10) library(monaLisa) library(ComplexHeatmap) library(circlize) ## ----loadLMRs----------------------------------------------------------------- lmrfile <- system.file("extdata", "LMRsESNPmerged.gr.rds", package = "monaLisa") lmr <- readRDS(lmrfile) lmr ## ----alternativeInputs, eval=FALSE-------------------------------------------- # # starting from a bed file # # import as `GRanges` using `rtracklayer::import` # # remark: if the bed file also contains scores (5th column), these will be # # also be imported and available in the "score" metadata column, # # in this example in `lmr$score` # lmr <- rtracklayer::import(con = "file.bed", format = "bed") # # # starting from sequences in a FASTA file # # import as `DNAStringSet` using `Biostrings::readDNAStringSet` # # remark: contrary to the coordinates in a `GRanges` object like `lmr` above, # # the sequences in `lmrseqs` can be directly used as input to # # monaLisa::calcBinnedMotifEnrR (no need to extract sequences from # # the genome, just skip that step below) # lmrseqs <- Biostrings::readDNAStringSet(filepath = "myfile.fa", format = "fasta") ## ----deltameth---------------------------------------------------------------- hist(lmr$deltaMeth, 100, col = "gray", main = "", xlab = "Change of methylation (NP - ES)", ylab = "Number of LMRs") ## ----lmrsel------------------------------------------------------------------- set.seed(1) lmrsel <- lmr[ sample(x = length(lmr), size = 10000, replace = FALSE) ] ## ----binlmrs------------------------------------------------------------------ bins <- bin(x = lmrsel$deltaMeth, binmode = "equalN", nElement = 800, minAbsX = 0.3) table(bins) ## ----------------------------------------------------------------------------- # find the index of the level representing the zero bin levels(bins) getZeroBin(bins) ## ----plotbins----------------------------------------------------------------- plotBinDensity(lmrsel$deltaMeth, bins, legend = "topleft") ## ----getmotifs---------------------------------------------------------------- pwms <- getMatrixSet(JASPAR2020, opts = list(matrixtype = "PWM", tax_group = "vertebrates")) ## ----makewidthequal----------------------------------------------------------- summary(width(lmrsel)) lmrsel <- trim(resize(lmrsel, width = median(width(lmrsel)), fix = "center")) summary(width(lmrsel)) ## ----getseqs------------------------------------------------------------------ lmrseqs <- getSeq(BSgenome.Mmusculus.UCSC.mm10, lmrsel) ## ----bindiag------------------------------------------------------------------ plotBinDiagnostics(seqs = lmrseqs, bins = bins, aspect = "GCfrac") plotBinDiagnostics(seqs = lmrseqs, bins = bins, aspect = "dinucfreq") ## ----runbinned, eval=FALSE---------------------------------------------------- # se <- calcBinnedMotifEnrR(seqs = lmrseqs, bins = bins, pwmL = pwms) ## ----getresults--------------------------------------------------------------- se <- readRDS(system.file("extdata", "results.binned_motif_enrichment_LMRs.rds", package = "monaLisa")) ## ----summarizedexperiment----------------------------------------------------- # summary se dim(se) # motifs-by-bins # motif info rowData(se) head(rownames(se)) # bin info colData(se) head(colnames(se)) # assays: the motif enrichment results assayNames(se) assay(se, "log2enr")[1:5, 1:3] ## ----plottfs, fig.height=10--------------------------------------------------- # select strongly enriched motifs sel <- apply(assay(se, "negLog10Padj"), 1, function(x) max(abs(x), 0, na.rm = TRUE)) > 4.0 sum(sel) seSel <- se[sel, ] # plot plotMotifHeatmaps(x = seSel, which.plots = c("log2enr", "negLog10Padj"), width = 2.0, cluster = TRUE, maxEnr = 2, maxSig = 10, show_motif_GC = TRUE) ## ----selsigvariants----------------------------------------------------------- # significantly enriched in bin 8 levels(bins)[8] sel.bin8 <- assay(se, "negLog10Padj")[, 8] > 4.0 sum(sel.bin8, na.rm = TRUE) # significantly enriched in any "non-zero" bin getZeroBin(bins) sel.nonZero <- apply( assay(se, "negLog10Padj")[, -getZeroBin(bins), drop = FALSE], 1, function(x) max(abs(x), 0, na.rm = TRUE)) > 4.0 sum(sel.nonZero) ## ----wmclustering------------------------------------------------------------- SimMatSel <- motifSimilarity(rowData(seSel)$motif.pfm) range(SimMatSel) ## ----plottfsclustered, fig.height=10, fig.width=8----------------------------- # create hclust object, similarity defined by 1 - Pearson correlation hcl <- hclust(as.dist(1 - SimMatSel), method = "average") plotMotifHeatmaps(x = seSel, which.plots = c("log2enr", "negLog10Padj"), width = 1.8, cluster = hcl, maxEnr = 2, maxSig = 10, show_dendrogram = TRUE, show_seqlogo = TRUE, width.seqlogo = 1.2) ## ----------------------------------------------------------------------------- # get PFMs from JASPAR2020 package (vertebrate subset) pfms <- getMatrixSet(JASPAR2020, opts = list(matrixtype = "PFM", tax_group = "vertebrates")) # convert PFMs to PWMs pwms <- toPWM(pfms) # convert JASPAR2020 PFMs (vertebrate subset) to Homer motif file tmp <- tempfile() convert <- dumpJaspar(filename = tmp, pkg = "JASPAR2020", pseudocount = 0, opts = list(tax_group = "vertebrates")) # convert Homer motif file to PFMatrixList pfms_ret <- homerToPFMatrixList(filename = tmp, n = 100L) # compare the first PFM # - notice the different magnitude of counts (controlled by `n`) # - notice that with the default (recommended) value of `pseudocount = 1.0`, # there would be no zero values in pfms_ret matrices, making # pfms and pfms_ret even more different as.matrix(pfms[[1]]) as.matrix(pfms_ret[[1]]) # compare position probability matrices with the original PFM round(sweep(x = as.matrix(pfms[[1]]), MARGIN = 2, STATS = colSums(as.matrix(pfms[[1]])), FUN = "/"), 3) round(sweep(x = as.matrix(pfms_ret[[1]]), MARGIN = 2, STATS = colSums(as.matrix(pfms_ret[[1]])), FUN = "/"), 3) ## ----defineSetsBinary--------------------------------------------------------- lmr.unchanged <- lmrsel[abs(lmrsel$deltaMeth) < 0.05] length(lmr.unchanged) lmr.up <- lmrsel[lmrsel$deltaMeth > 0.6] length(lmr.up) ## ----fuseSetsBinary----------------------------------------------------------- # combine the two sets or genomic regions lmrsel2 <- c(lmr.unchanged, lmr.up) # extract sequences from the genome lmrseqs2 <- getSeq(BSgenome.Mmusculus.UCSC.mm10, lmrsel2) ## ----createBins2-------------------------------------------------------------- # define binning vector bins2 <- rep(c("unchanged", "up"), c(length(lmr.unchanged), length(lmr.up))) bins2 <- factor(bins2) table(bins2) ## ----enrBinary---------------------------------------------------------------- se2 <- calcBinnedMotifEnrR(seqs = lmrseqs2, bins = bins2, pwmL = pwms[rownames(seSel)]) se2 ## ----plotBinary, fig.height=6, fig.width=8------------------------------------ sel2 <- apply(assay(se2, "negLog10Padj"), 1, function(x) max(abs(x), 0, na.rm = TRUE)) > 4.0 sum(sel2) plotMotifHeatmaps(x = se2[sel2,], which.plots = c("log2enr", "negLog10Padj"), width = 1.8, cluster = TRUE, maxEnr = 2, maxSig = 10, show_seqlogo = TRUE) ## ----singleBinSeqs------------------------------------------------------------ lmrseqs3 <- lmrseqs[bins == levels(bins)[1]] length(lmrseqs3) se3 <- calcBinnedMotifEnrR(seqs = lmrseqs3, pwmL = pwms[rownames(seSel)], background = "genome", genome = BSgenome.Mmusculus.UCSC.mm10, genome.regions = NULL, # sample from full genome genome.oversample = 2, BPPARAM = BiocParallel::SerialParam(RNGseed = 42), verbose = TRUE) ## ----singleBinResult---------------------------------------------------------- ncol(se3) ## ----plotSingleBin, fig.height=8, fig.width=8--------------------------------- sel3 <- assay(se3, "negLog10Padj")[, 1] > 4.0 sum(sel3) plotMotifHeatmaps(x = se3[sel3,], which.plots = c("log2enr", "negLog10Padj"), width = 1.8, maxEnr = 2, maxSig = 10, show_seqlogo = TRUE) # analyzed HOX motifs grep("HOX", rowData(se3)$motif.name, value = TRUE) # significant HOX motifs grep("HOX", rowData(se3)$motif.name[sel3], value = TRUE) ## ----compareGenomeVsBinned, fig.width=7, fig.height=6------------------------- cols <- rep("gray", nrow(se3)) cols[grep("HOX", rowData(se3)$motif.name)] <- "#DF536B" cols[grep("KLF|Klf", rowData(se3)$motif.name)] <- "#61D04F" par(mar = c(5, 5, 2, 2) + .1, mgp = c(1.75, 0.5, 0), cex = 1.25) plot(assay(seSel, "log2enr")[,1], assay(se3, "log2enr")[,1], col = cols, pch = 20, asp = 1, xlab = "Versus other bins (log2 enr)", ylab = "Versus genome (log2 enr)") legend("topleft", c("HOX family","KLF family","other"), pch = 20, bty = "n", col = c("#DF536B", "#61D04F", "gray")) abline(a = 0, b = 1) abline(h = 0, lty = 3) abline(v = 0, lty = 3) ## ----binnedkmerenr, eval=FALSE------------------------------------------------ # sekm <- calcBinnedKmerEnr(seqs = lmrseqs, bins = bins, kmerLen = 6, # includeRevComp = TRUE) ## ----binnedkmerenr-load------------------------------------------------------- sekm <- readRDS(system.file( "extdata", "results.binned_6mer_enrichment_LMRs.rds", package = "monaLisa" )) ## ----------------------------------------------------------------------------- sekm ## ----------------------------------------------------------------------------- selkm <- apply(assay(sekm, "negLog10Padj"), 1, function(x) max(abs(x), 0, na.rm = TRUE)) > 4 sum(selkm) sekmSel <- sekm[selkm, ] ## ---- fig.width=10, fig.height=10--------------------------------------------- pfmSel <- rowData(seSel)$motif.pfm sims <- motifKmerSimilarity(x = pfmSel, kmers = rownames(sekmSel), includeRevComp = TRUE) dim(sims) maxwidth <- max(sapply(TFBSTools::Matrix(pfmSel), ncol)) seqlogoGrobs <- lapply(pfmSel, seqLogoGrob, xmax = maxwidth) hmSeqlogo <- rowAnnotation(logo = annoSeqlogo(seqlogoGrobs, which = "row"), annotation_width = unit(1.5, "inch"), show_annotation_name = FALSE ) Heatmap(sims, show_row_names = TRUE, row_names_gp = gpar(fontsize = 8), show_column_names = TRUE, column_names_gp = gpar(fontsize = 8), name = "Similarity", column_title = "Selected TFs and enriched k-mers", col = colorRamp2(c(0, 1), c("white", "red")), right_annotation = hmSeqlogo) ## ----findMotifs, warning=FALSE------------------------------------------------ # get sequences of promoters as a DNAStringSet # (the `subject` of `findMotifHits` could also be a single DNAString, # or the name of a fasta file) library(TxDb.Mmusculus.UCSC.mm10.knownGene) gr <- trim(promoters(TxDb.Mmusculus.UCSC.mm10.knownGene, upstream = 1000, downstream = 500)[c(1, 4, 5, 10)]) library(BSgenome.Mmusculus.UCSC.mm10) seqs <- getSeq(BSgenome.Mmusculus.UCSC.mm10, gr) seqs # get motifs as a PWMatrixList # (the `query` of `findMotifHits` could also be a single PWMatrix, # or the name of a motif file) library(JASPAR2020) library(TFBSTools) pfms <- getMatrixByID(JASPAR2020, c("MA0885.1", "MA0099.3", "MA0033.2", "MA0037.3", "MA0158.1")) pwms <- toPWM(pfms) pwms name(pwms) # predict hits in sequences res <- findMotifHits(query = pwms, subject = seqs, min.score = 6.0, method = "matchPWM", BPPARAM = BiocParallel::SerialParam()) res # create hit matrix: # number of sites of each motif per sequence m <- table(factor(seqnames(res), levels = names(seqs)), factor(res$pwmname, levels = name(pwms))) m ## ---- session----------------------------------------------------------------- sessionInfo()