## ----setup, echo=FALSE, message=FALSE, warning=FALSE--------------------------
knitr::opts_chunk$set(collapse = TRUE, comment = "#>", fig.pos = "H")
suppressPackageStartupMessages({
  library(universalmotif)
  library(Biostrings)
  library(GenomicRanges)
  library(GenomeInfoDb)
})
options(Biostrings.coloring = FALSE)

has_pkgs <- suppressWarnings(suppressMessages(vapply(
  c("rtracklayer", "BSgenome.Athaliana.TAIR.TAIR9", "MotifDb"),
  requireNamespace, logical(1), quietly = TRUE)))
if (!all(has_pkgs)) {
  knitr::opts_chunk$set(eval = FALSE)
  message("Skipping ChIP-seq vignette: required Suggests packages ",
          "are not installed (",
          paste(names(has_pkgs)[!has_pkgs], collapse = ", "),
          ").")
}

## -----------------------------------------------------------------------------
library(rtracklayer)

bed.file <- system.file("extdata",
                        "VAL1-GFP_val1_calledPeaks.bed",
                        package = "universalmotif")
peaks <- import(bed.file, format = "BED")
peaks

## -----------------------------------------------------------------------------
seqlevels(peaks) <- paste0("Chr", seqlevels(peaks))
peaks <- keepSeqlevels(peaks,
                       paste0("Chr", 1:5),
                       pruning.mode = "coarse")
summary(width(peaks))
length(peaks)

## -----------------------------------------------------------------------------
library(BSgenome.Athaliana.TAIR.TAIR9)

peaks.500 <- resize(peaks, width = 500, fix = "center")
## Drop any windows that ran off a chromosome end.
peaks.500 <- trim(peaks.500)
peaks.500 <- peaks.500[width(peaks.500) == 500]

peak.seqs <- getSeq(Athaliana, peaks.500)
names(peak.seqs) <- paste0("peak_", seq_along(peak.seqs))
peak.seqs

## -----------------------------------------------------------------------------
set.seed(2026)
bkg.seqs <- match_bkg(peak.seqs, genome = Athaliana, exclude = peaks.500)
length(bkg.seqs)

## -----------------------------------------------------------------------------
summary(letterFrequency(peak.seqs, "GC", as.prob = TRUE))
summary(letterFrequency(bkg.seqs,  "GC", as.prob = TRUE))

## ----motif-finder, cache=FALSE------------------------------------------------
discovered.df <- motif_finder(peak.seqs,
                              bkg.sequences = bkg.seqs,
                              nmotifs       = 5,
                              min.width     = 6,
                              max.width     = 12,
                              rng.seed      = 2026,
                              nthreads      = 2)
print(as.data.frame(discovered.df[, c("name", "consensus", "pval")]),
      row.names = FALSE)

## -----------------------------------------------------------------------------
discovered <- to_list(discovered.df, extrainfo = FALSE)

## ----fig.height=6, fig.width=7------------------------------------------------
discovered.labelled <- discovered
for (i in seq_along(discovered.labelled)) {
  discovered.labelled[[i]]["name"] <- sprintf(
    "%s  (p = %.1e)",
    discovered.labelled[[i]]["name"],
    discovered.df$pval[i]
  )
}
view_motifs2(discovered.labelled, sort.by = "similarity",
             show.positions = FALSE, names.pos = "right")

## -----------------------------------------------------------------------------
discovered.merged <- merge_similar2(discovered, qvalue = 0.01)
discovered.merged <- trim_motifs(discovered.merged)

length(discovered)
length(discovered.merged)

## -----------------------------------------------------------------------------
label_merged <- function(consensus) {
  if (grepl("CATGCA", consensus))                    "RY"
  else if (grepl("CACGTG", consensus))               "G_box"
  else if (grepl("ACGTAC|GTACGT", consensus))        "bZIP"
  else if (grepl("AAAAAA|ATATAT|AAATAA", consensus)) "AT_rich"
  else if (grepl("GAGAGA", consensus))               "GA_repeat"
  else                                               consensus
}
new.names <- vapply(discovered.merged,
                    function(m) m["name"], character(1))
for (i in seq_along(discovered.merged)) {
  new.names[i] <- label_merged(
    discovered.merged[[i]]["consensus"]
  )
  discovered.merged[[i]]["name"] <- new.names[i]
}
new.names

## ----fig.height=4, fig.width=7------------------------------------------------
view_motifs2(discovered.merged, sort.by = "similarity",
             show.positions = FALSE, names.pos = "right")

## ----motifdb, warning=FALSE, message=FALSE------------------------------------
suppressPackageStartupMessages(library(MotifDb))

ath.jaspar <- query(query(MotifDb, "Athaliana"), "jaspar2022")
length(ath.jaspar)
ath.motifs <- convert_motifs(ath.jaspar)

## -----------------------------------------------------------------------------
combined <- c(discovered.merged, ath.motifs)
n.disc   <- length(discovered.merged)

cmp <- compare_motifs2(combined,
                       compare.to = seq_len(n.disc),
                       qvalue     = 0.2,
                       nthreads   = 2)
cmp <- cmp[cmp$subject != cmp$target, ]
cmp <- cmp[order(cmp$qvalue), ]
print(head(cmp[, c("subject", "target", "score", "qvalue",
                   "subject.consensus", "target.consensus")], 10),
      row.names = FALSE)

## ----fig.height=2.5, fig.width=7----------------------------------------------
ref.names <- vapply(ath.motifs, function(m) m["name"], character(1))
ry.cmp <- cmp[cmp$subject == "RY", ]
ry.best <- ry.cmp[order(ry.cmp$qvalue), ][1, ]

top.disc <- discovered.merged[[match("RY", new.names)]]
top.ref  <- ath.motifs[[match(ry.best$target, ref.names)]]
view_motifs2(c(top.disc, top.ref), show.positions.once = FALSE,
             names.pos = "right")

## -----------------------------------------------------------------------------
enr <- enrich_motifs2(motifs        = discovered.merged,
                      sequences     = peak.seqs,
                      bkg.sequences = bkg.seqs,
                      pvalue        = 1e-4,
                      qvalue        = 0.05,
                      nthreads      = 2)
print(enr[, c("motif", "consensus", "enrichment", "qvalue")],
      row.names = FALSE)

## -----------------------------------------------------------------------------
hits <- scan_sequences2(discovered.merged,
                        peak.seqs,
                        pvalue         = 1e-4,
                        return.granges = FALSE,
                        nthreads       = 2)
nrow(hits)
table(motif = hits$motif)

## -----------------------------------------------------------------------------
pk <- motif_peaks(hits, seq.length = 500, mode = "central")
print(pk[, c("motif", "best.center", "best.window", "qvalue")],
      row.names = FALSE)

## ----fig.height=3.5, fig.width=7----------------------------------------------
plot_motif_peaks(pk)

## -----------------------------------------------------------------------------
co <- motif_coocc(discovered.merged,
                  hits         = hits,
                  n.sequences  = length(peak.seqs),
                  max.distance = 50L,
                  self.pairs   = TRUE)
print(co[, c("motif_a", "motif_b", "both", "both.clustered",
             "median.distance", "qvalue")],
      row.names = FALSE)

## ----fig.height=3, fig.width=7------------------------------------------------
ry.hits  <- hits[hits$motif == "RY", ]
ry.dists <- unlist(lapply(split(ry.hits$start, ry.hits$sequence.i),
                          function(s) if (length(s) < 2L)
                            integer(0) else as.integer(dist(s))))
length(ry.dists)
ggplot2::ggplot(data.frame(distance = ry.dists),
                ggplot2::aes(x = .data$distance)) +
  ggplot2::geom_histogram(binwidth = 5, fill = "black", colour = NA) +
  ggplot2::geom_vline(xintercept = 50, linetype = "dashed",
                      colour = "grey50") +
  ggplot2::labs(x = "RY-RY distance (bp)", y = "Count") +
  ggplot2::theme_bw() +
  ggplot2::theme(panel.grid         = ggplot2::element_blank(),
                 panel.border       = ggplot2::element_blank(),
                 axis.line.x.bottom = ggplot2::element_line(colour = "black"),
                 axis.line.y.left   = ggplot2::element_line(colour = "black"))

## ----fig.height=5, fig.width=7------------------------------------------------
ry.by.seq <- split(ry.hits$start, ry.hits$sequence.i)
other.motifs <- setdiff(unique(hits$motif), "RY")
cross.dists <- do.call(rbind, lapply(other.motifs, function(om) {
  om.hits   <- hits[hits$motif == om, ]
  om.by.seq <- split(om.hits$start, om.hits$sequence.i)
  shared    <- intersect(names(ry.by.seq), names(om.by.seq))
  d <- unlist(lapply(shared, function(s)
    abs(as.integer(outer(ry.by.seq[[s]], om.by.seq[[s]], "-")))))
  if (!length(d)) return(NULL)
  data.frame(other = om, distance = d, stringsAsFactors = FALSE)
}))
ggplot2::ggplot(cross.dists, ggplot2::aes(x = .data$distance)) +
  ggplot2::geom_histogram(binwidth = 10, fill = "black", colour = NA) +
  ggplot2::geom_vline(xintercept = 50, linetype = "dashed",
                      colour = "grey50") +
  ggplot2::facet_wrap(~ other, scales = "free_y") +
  ggplot2::labs(x = "RY-to-other distance (bp)", y = "Count") +
  ggplot2::theme_bw() +
  ggplot2::theme(panel.grid         = ggplot2::element_blank(),
                 panel.border       = ggplot2::element_blank(),
                 strip.background   = ggplot2::element_rect(fill = NA,
                                                            colour = NA),
                 axis.line.x.bottom = ggplot2::element_line(colour = "black"),
                 axis.line.y.left   = ggplot2::element_line(colour = "black"))

## -----------------------------------------------------------------------------
disc.names <- new.names

best.match <- vapply(disc.names, function(nm) {
  i <- which(cmp$subject == nm)
  if (!length(i)) NA_character_ else cmp$target[i[1]]
}, character(1))

enrich.q <- vapply(disc.names, function(nm) {
  i <- which(enr$motif == nm)
  if (!length(i)) NA_real_ else enr$qvalue[i[1]]
}, numeric(1))

pos.q <- vapply(disc.names, function(nm) {
  i <- which(pk$motif == nm)
  if (!length(i)) NA_real_ else pk$qvalue[i[1]]
}, numeric(1))

best.partner <- vapply(disc.names, function(nm) {
  ix <- which((co$motif_a == nm | co$motif_b == nm) &
              co$motif_a != co$motif_b)
  if (!length(ix)) return(NA_character_)
  ix <- ix[which.min(co$qvalue[ix])]
  if (co$motif_a[ix] == nm) co$motif_b[ix] else co$motif_a[ix]
}, character(1))

summary.tbl <- data.frame(
  motif         = disc.names,
  jaspar.match  = best.match,
  enrich.q      = signif(enrich.q, 3),
  positional.q  = signif(pos.q, 3),
  best.partner  = best.partner,
  stringsAsFactors = FALSE
)
rownames(summary.tbl) <- NULL
print(summary.tbl, row.names = FALSE)

## -----------------------------------------------------------------------------
sessionInfo()