## ----setup, echo=FALSE, message=FALSE, warning=FALSE--------------------------
knitr::opts_chunk$set(collapse = TRUE, comment = "#>", fig.pos = "H")
suppressPackageStartupMessages({
  library(universalmotif)
  library(ggplot2)
})

has_pkgs <- suppressWarnings(suppressMessages(vapply(
  c("MotifDb", "ape", "ggtree"),
  requireNamespace, logical(1), quietly = TRUE)))
if (!all(has_pkgs)) {
  knitr::opts_chunk$set(eval = FALSE)
  message("Skipping curated-DB vignette: required Suggests packages ",
          "are not installed (",
          paste(names(has_pkgs)[!has_pkgs], collapse = ", "),
          ").")
}

## ----message=FALSE, warning=FALSE---------------------------------------------
suppressPackageStartupMessages(library(MotifDb))

hs <- suppressWarnings(query(MotifDb, "hsapiens"))
sources <- c("jaspar2022", "HOCOMOCOv11-core-A", "cisbp_1.02")
per.source <- suppressWarnings(lapply(sources, function(s)
  convert_motifs(query(hs, s))))
names(per.source) <- sources
sapply(per.source, length)

## -----------------------------------------------------------------------------
for (s in sources) {
  for (i in seq_along(per.source[[s]]))
    per.source[[s]][[i]]["family"] <- s
}
all.motifs <- do.call(c, per.source)
length(all.motifs)

## ----fig.height=2.5, fig.width=6----------------------------------------------
src.df <- data.frame(source = sources,
                     count  = sapply(per.source, length),
                     stringsAsFactors = FALSE)
ggplot(src.df, aes(x = .data$source, y = .data$count)) +
  geom_col(fill = "black") +
  geom_text(aes(label = .data$count), vjust = -0.4) +
  labs(x = NULL, y = "Number of motifs") +
  theme_bw() +
  theme(panel.grid         = element_blank(),
        panel.border       = element_blank(),
        axis.line.x.bottom = element_line(colour = "black"),
        axis.line.y.left   = element_line(colour = "black"))

## ----eval=FALSE---------------------------------------------------------------
# jaspar.motifs   <- read_jaspar("JASPAR2024_CORE_vertebrates.txt")
# hocomoco.motifs <- read_meme("HOCOMOCOv11_core_HUMAN_mono_meme.txt")
# cisbp.motifs    <- read_cisbp("CIS-BP_2.00_Homo_sapiens.txt")
# transfac.motifs <- read_transfac("transfac_matrix.dat")
# uniprobe.motifs <- read_uniprobe("uniprobe_results.txt")
# all.motifs      <- c(jaspar.motifs, hocomoco.motifs, cisbp.motifs,
#                      transfac.motifs, uniprobe.motifs)

## -----------------------------------------------------------------------------
qmat <- compare_motifs2(all.motifs,
                        matrix.out = "qvalue",
                        nthreads   = 2)
dim(qmat)
qvals <- qmat[upper.tri(qmat)]
length(qvals)
summary(qvals)

## ----fig.height=3, fig.width=6------------------------------------------------
log10q <- log10(pmax(qvals, .Machine$double.xmin))
ggplot(data.frame(log10q = log10q), aes(x = .data$log10q)) +
  geom_histogram(binwidth = 0.5, fill = "black", colour = NA) +
  geom_vline(xintercept = log10(1e-6), linetype = "dashed",
             colour = "red") +
  labs(x = expression(log[10] * "(q-value) of pairwise comparison"),
       y = "Number of pairs") +
  theme_bw() +
  theme(panel.grid         = element_blank(),
        panel.border       = element_blank(),
        axis.line.x.bottom = element_line(colour = "black"),
        axis.line.y.left   = element_line(colour = "black"))

## -----------------------------------------------------------------------------
mean(qvals < 1e-6)
sum(qvals < 1e-6)

## -----------------------------------------------------------------------------
clusters <- merge_similar2(all.motifs,
                           qvalue          = 1e-6,
                           return.clusters = TRUE,
                           nthreads        = 2)

## ----fig.height=6, fig.width=7------------------------------------------------
big.cluster <- names(sort(table(clusters$cluster),
  decreasing = TRUE)[1])
big.cluster.motifs <- all.motifs[
  clusters$motif.i[clusters$cluster == big.cluster]]
if (length(big.cluster.motifs) > 10)
  big.cluster.motifs <- big.cluster.motifs[1:10]

view_motifs2(big.cluster.motifs,
  names.pos = "right", show.positions = FALSE)

## -----------------------------------------------------------------------------
merged <- merge_similar2(all.motifs,
                         qvalue   = 1e-6,
                         nthreads = 2)
length(all.motifs)
length(merged)

## -----------------------------------------------------------------------------
contrib.names <- tapply(clusters$name, clusters$cluster,
                        function(n) as.character(n))
for (i in seq_along(merged)) {
  nms <- contrib.names[[i]]
  if (length(nms) == 2L)
    merged[[i]]["name"] <- paste(nms, collapse = "+")
  else if (length(nms) >= 3L)
    merged[[i]]["name"] <- sprintf("%s+%s+%dmore",
                                   nms[1], nms[2], length(nms) - 2L)
}
## `clusters` is a universalmotif_df: one row per input motif, with
## the motif objects in the `motif` column plus the `motif.i` and
## `cluster` bookkeeping columns. Show just the assignment columns:
print(head(as.data.frame(clusters)[, c("name", "motif.i", "cluster")], 8),
      row.names = FALSE)

## ----fig.height=3, fig.width=6------------------------------------------------
cluster.sizes <- table(clusters$cluster)
size.dist     <- as.data.frame(table(cluster.sizes),
                               stringsAsFactors = FALSE)
names(size.dist) <- c("cluster.size", "n.clusters")
size.dist$cluster.size <- as.integer(size.dist$cluster.size)
ggplot(size.dist, aes(x = factor(.data$cluster.size),
                      y = .data$n.clusters)) +
  geom_col(fill = "black") +
  geom_text(aes(label = .data$n.clusters), vjust = -0.4, size = 3) +
  labs(x = "Cluster size (number of input motifs)",
       y = "Number of clusters") +
  theme_bw() +
  theme(panel.grid         = element_blank(),
        panel.border       = element_blank(),
        axis.line.x.bottom = element_line(colour = "black"),
        axis.line.y.left   = element_line(colour = "black"))

## -----------------------------------------------------------------------------
clusters$source <- vapply(all.motifs[clusters$motif.i],
                          function(m) m["family"], character(1))
sources.per.cluster <- tapply(clusters$source, clusters$cluster,
                              function(s) length(unique(s)))
table(sources.per.cluster)

## ----fig.height=6, fig.width=6, warning=FALSE, message=FALSE------------------
set.seed(2026)
sub.idx <- sort(sample(length(all.motifs), 200))
sub     <- all.motifs[sub.idx]
clean.names <- make.unique(gsub("[^A-Za-z0-9_]", "_",
                                vapply(sub, function(m) m["name"],
                                       character(1))))
for (i in seq_along(sub)) sub[[i]]["name"] <- clean.names[i]
motif_tree2(sub, linecol = "family", labels = "none",
            layout = "circular", progress = FALSE) +
  ggplot2::scale_color_discrete(breaks = sources) +
  ggplot2::theme(legend.position = "bottom",
                 legend.title    = ggplot2::element_blank())

## ----fig.height=2.5, fig.width=7----------------------------------------------
trim.delta <- vapply(merged, function(m)
  ncol(m["motif"]) - ncol(trim_motifs(m)["motif"]),
  integer(1))
trim.candidate.idx <- which.max(trim.delta)
trim.before <- merged[[trim.candidate.idx]]
trim.before["name"] <- paste0(trim.before["name"], "  (untrimmed)")
trim.after  <- trim_motifs(merged[[trim.candidate.idx]])
trim.after["name"] <- paste0(trim.after["name"], "  (trimmed)")
view_motifs2(list(trim.before, trim.after), names.pos = "right")

## -----------------------------------------------------------------------------
curated <- trim_motifs(merged)
range(sapply(merged,  function(m) ncol(m["motif"])))
range(sapply(curated, function(m) ncol(m["motif"])))

## -----------------------------------------------------------------------------
src.by.cluster <- tapply(clusters$source, clusters$cluster,
                         function(s)
                           paste(sort(unique(s)), collapse = ";"))
n.by.cluster   <- tapply(clusters$source, clusters$cluster,
                         length)

curated.df <- to_df(curated, extrainfo = TRUE)
curated.df$dataSource    <- NULL  # MotifDb's single-source field;
                                  # superseded by the multi-source
                                  # db.sources column we are about
                                  # to add
curated.df$db.sources    <- as.character(src.by.cluster)
curated.df$n.contributors <- as.integer(n.by.cluster)
curated.df$curation.date <- "2026-05-25"

print(as.data.frame(head(curated.df[, c("name", "db.sources",
                                        "n.contributors",
                                        "curation.date")], 10)),
      row.names = FALSE)

## -----------------------------------------------------------------------------
curated <- to_list(update_motifs(curated.df), extrainfo = TRUE)
curated[[1]]["extrainfo"]

## -----------------------------------------------------------------------------
out.file <- tempfile(fileext = ".meme")
write_meme(curated, file = out.file, overwrite = TRUE)
file.info(out.file)$size

## -----------------------------------------------------------------------------
cat(head(readLines(out.file), 25), sep = "\n")

## -----------------------------------------------------------------------------
re.in <- read_meme(out.file)
length(re.in)
length(curated)
identical(sapply(re.in,   function(m) m["name"]),
          sapply(curated, function(m) m["name"]))

## -----------------------------------------------------------------------------
sessionInfo()