params <-
list(step1_ga_results_dir = "run_20260320_133141", seqqual_results_dir = "run_20260320_162359")

## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(
  echo = FALSE,
  warning = FALSE,
  message = FALSE
)

## ----benchmark-data-loading---------------------------------------------------
suppressPackageStartupMessages({
  library(readr)
  library(dplyr)
  library(tidyr)
  library(ggplot2)
  library(scales)
})

resolve_results_dir <- function(path) {
  input_dir <- tryCatch(dirname(normalizePath(knitr::current_input(), winslash = "/", mustWork = TRUE)), error = function(e) NA_character_)
  wd <- tryCatch(normalizePath(getwd(), winslash = "/", mustWork = TRUE), error = function(e) NA_character_)
  candidates <- c(
    path,
    if (!is.na(input_dir)) file.path(input_dir, path) else NA_character_,
    if (!is.na(input_dir)) file.path(input_dir, "..", path) else NA_character_,
    if (!is.na(input_dir)) file.path(input_dir, "..", "..", path) else NA_character_,
    if (!is.na(input_dir)) file.path(input_dir, "..", "inst", "benchmarks", path) else NA_character_,
    if (!is.na(wd)) file.path(wd, path) else NA_character_,
    if (!is.na(wd)) file.path(wd, "inst", "benchmarks", path) else NA_character_
  )
  candidates <- unique(candidates[!is.na(candidates) & nzchar(candidates)])
  hits <- candidates[file.exists(candidates)]
  if (length(hits) == 0) {
    stop("Could not locate results_dir: ", path, call. = FALSE)
  }
  normalizePath(hits[[1]], winslash = "/", mustWork = TRUE)
}

read_if_exists <- function(path) {
  if (file.exists(path)) readr::read_csv(path, show_col_types = FALSE) else tibble::tibble()
}

load_run <- function(path_label) {
  dir_path <- resolve_results_dir(path_label)
  summary_path <- if (file.exists(file.path(dir_path, "summary_qmd.csv"))) {
    file.path(dir_path, "summary_qmd.csv")
  } else {
    file.path(dir_path, "summary.csv")
  }

  list(
    label = basename(dir_path),
    dir = dir_path,
    summary = read_if_exists(summary_path),
    files = read_if_exists(file.path(dir_path, "files.csv")),
    host = read_if_exists(file.path(dir_path, "host_info.csv")),
    reference_counts = read_if_exists(file.path(dir_path, "reference_counts.csv")),
    correctness = read_if_exists(file.path(dir_path, "correctness_preflight.csv"))
  )
}

run_main <- load_run(params$step1_ga_results_dir)
run_seqqual <- load_run(params$seqqual_results_dir)

main_ok <- run_main$summary %>%
  filter(status == "ok", workload %in% c("step1", "galignments"), comparison_track == "fair")

seq_ok <- run_seqqual$summary %>%
  filter(status == "ok", workload == "seqqual")

single_main <- main_ok %>% filter(scenario == "single")
multi_main <- main_ok %>% filter(scenario == "multi")

seq_single <- seq_ok %>% filter(scenario == "single")
seq_multi <- seq_ok %>% filter(scenario == "multi")

host_main <- if (nrow(run_main$host) > 0) run_main$host[1, ] else tibble::tibble()
host_seq <- if (nrow(run_seqqual$host) > 0) run_seqqual$host[1, ] else tibble::tibble()

## ----benchmark-provenance-table-----------------------------------------------
run_overview_tbl <- tibble::tibble(
  run = c(run_main$label, run_seqqual$label),
  workloads = c("step1, galignments", "seqqual"),
  profile = c(
    if (nrow(run_main$host) > 0) host_main$profile[[1]] else NA_character_,
    if (nrow(run_seqqual$host) > 0) host_seq$profile[[1]] else NA_character_
  ),
  cpu = c(
    if (nrow(run_main$host) > 0) host_main$cpu_model[[1]] else NA_character_,
    if (nrow(run_seqqual$host) > 0) host_seq$cpu_model[[1]] else NA_character_
  ),
  logical_cores = c(
    if (nrow(run_main$host) > 0) host_main$logical_cores[[1]] else NA_integer_,
    if (nrow(run_seqqual$host) > 0) host_seq$logical_cores[[1]] else NA_integer_
  ),
  total_mem_gb = c(
    if (nrow(run_main$host) > 0) round(host_main$total_mem_gb[[1]], 1) else NA_real_,
    if (nrow(run_seqqual$host) > 0) round(host_seq$total_mem_gb[[1]], 1) else NA_real_
  ),
  successful_cases = c(nrow(main_ok), nrow(seq_ok))
)

knitr::kable(run_overview_tbl, col.names = c("Run", "Workloads", "Profile", "CPU", "Logical cores", "RAM (GB)", "Successful cases"))

## ----benchmark-input-files----------------------------------------------------
run_main$files %>%
  transmute(
    file,
    source,
    selected_for_single,
    selected_for_multi,
    size_mb = round(size_mb, 1),
    has_index
  ) %>%
  knitr::kable()

## ----benchmark-reference-counts-----------------------------------------------
bind_rows(
  run_main$reference_counts %>% mutate(run = run_main$label),
  run_seqqual$reference_counts %>% mutate(run = run_seqqual$label)
) %>%
  mutate(total_mb = round(total_mb, 1)) %>%
  select(run, scenario, workload, n_files, n_records, total_mb) %>%
  knitr::kable()

## ----best-observed-summary----------------------------------------------------
best_summary_tbl <- bind_rows(main_ok, seq_ok) %>%
  group_by(scenario, workload, comparison_track, method_family) %>%
  slice_min(order_by = median_s, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(
    plan = paste0(bp_workers_effective, "x", threads_effective),
    median_s = round(median_s, 3)
  ) %>%
  select(scenario, workload, comparison_track, method_family, method, plan, median_s)

knitr::kable(best_summary_tbl, col.names = c("Scenario", "Workload", "Track", "Method family", "Method", "Plan", "Median time (s)"))

## ----fold-change-summary------------------------------------------------------
best_fold_tbl <- function(df, scenario_name, include_optimized = FALSE) {
  bamscale_fair <- df %>%
    filter(scenario == scenario_name, method_family == "BamScale", comparison_track == "fair") %>%
    group_by(workload) %>%
    slice_min(order_by = median_s, n = 1, with_ties = FALSE) %>%
    ungroup() %>%
    transmute(
      scenario = scenario_name,
      workload,
      track = "fair",
      bamscale_method = method,
      bamscale_plan = paste0(bp_workers_effective, "x", threads_effective),
      bamscale_s = median_s
    ) %>%
    left_join(
      df %>%
        filter(scenario == scenario_name, method_family != "BamScale", comparison_track == "fair") %>%
        group_by(workload) %>%
        slice_min(order_by = median_s, n = 1, with_ties = FALSE) %>%
        ungroup() %>%
        transmute(
          workload,
          comparator_family = method_family,
          comparator_method = method,
          comparator_plan = paste0(bp_workers_effective, "x", threads_effective),
          comparator_s = median_s
        ),
      by = "workload"
    ) %>%
    mutate(
      fold_change = comparator_s / bamscale_s,
      pct_faster = 100 * (1 - bamscale_s / comparator_s)
    )

  if (!include_optimized) {
    return(bamscale_fair)
  }

  bamscale_optimized <- df %>%
    filter(scenario == scenario_name, method_family == "BamScale", comparison_track == "optimized") %>%
    group_by(workload) %>%
    slice_min(order_by = median_s, n = 1, with_ties = FALSE) %>%
    ungroup() %>%
    transmute(
      scenario = scenario_name,
      workload,
      track = "optimized",
      bamscale_method = method,
      bamscale_plan = paste0(bp_workers_effective, "x", threads_effective),
      bamscale_s = median_s
    ) %>%
    left_join(
      df %>%
        filter(scenario == scenario_name, method_family != "BamScale", comparison_track == "fair") %>%
        group_by(workload) %>%
        slice_min(order_by = median_s, n = 1, with_ties = FALSE) %>%
        ungroup() %>%
        transmute(
          workload,
          comparator_family = method_family,
          comparator_method = method,
          comparator_plan = paste0(bp_workers_effective, "x", threads_effective),
          comparator_s = median_s
        ),
      by = "workload"
    ) %>%
    mutate(
      fold_change = comparator_s / bamscale_s,
      pct_faster = 100 * (1 - bamscale_s / comparator_s)
    )

  bind_rows(bamscale_fair, bamscale_optimized)
}

fold_tbl <- bind_rows(
  best_fold_tbl(main_ok, "single"),
  best_fold_tbl(main_ok, "multi"),
  best_fold_tbl(seq_ok, "single", include_optimized = TRUE),
  best_fold_tbl(seq_ok, "multi", include_optimized = TRUE)
) %>%
  mutate(
    across(c(bamscale_s, comparator_s, fold_change, pct_faster), ~ round(.x, 3))
  )

knitr::kable(
  fold_tbl %>%
    select(scenario, workload, track, comparator_family, comparator_method, comparator_plan,
           comparator_s, bamscale_method, bamscale_plan, bamscale_s, fold_change, pct_faster),
  col.names = c("Scenario", "Workload", "Track", "Comparator family", "Comparator method",
                "Comparator plan", "Comparator time (s)", "BamScale method", "BamScale plan",
                "BamScale time (s)", "Comparator/BamScale", "BamScale % faster")
)

## ----fold-change-plot, fig.width=9, fig.height=4.8----------------------------
ggplot(fold_tbl, aes(x = interaction(workload, scenario, track, sep = "\n"), y = fold_change, fill = workload)) +
  geom_col(width = 0.75) +
  geom_text(aes(label = sprintf("%.2fx", fold_change)), vjust = -0.3, size = 3.5) +
  scale_fill_brewer(palette = "Set2") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  labs(
    x = NULL,
    y = "Speedup over comparator (x)",
    fill = "Workload",
    title = "Best-observed BamScale speedup over comparator"
  ) +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 18, hjust = 1))

## ----step1-single-plot, fig.width=7.5, fig.height=4.8-------------------------
single_main %>%
  filter(workload == "step1") %>%
  mutate(
    method_label = if_else(method_family == "BamScale", "BamScale", "Rsamtools::scanBam")
  ) %>%
  ggplot(aes(x = threads_effective, y = median_s, color = method_label)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2.7) +
  scale_x_continuous(breaks = c(1, 4, 12, 24, 48)) +
  labs(
    x = "Threads",
    y = "Median time (s)",
    color = NULL,
    title = "Single-file step1 scaling"
  ) +
  theme_minimal(base_size = 12)

## ----step1-single-table-------------------------------------------------------
single_main %>%
  filter(workload == "step1") %>%
  mutate(plan = paste0(bp_workers_effective, "x", threads_effective)) %>%
  select(method_family, method, plan, median_s) %>%
  arrange(median_s) %>%
  knitr::kable(col.names = c("Method family", "Method", "Plan", "Median time (s)"))

## ----galignments-single-plot, fig.width=7.5, fig.height=4.8-------------------
single_main %>%
  filter(workload == "galignments") %>%
  mutate(
    method_label = if_else(method_family == "BamScale", "BamScale", "GenomicAlignments::readGAlignments")
  ) %>%
  ggplot(aes(x = threads_effective, y = median_s, color = method_label)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2.7) +
  scale_x_continuous(breaks = c(1, 4, 12, 24, 48)) +
  labs(
    x = "Threads",
    y = "Median time (s)",
    color = NULL,
    title = "Single-file galignments scaling"
  ) +
  theme_minimal(base_size = 12)

## ----galignments-single-table-------------------------------------------------
single_main %>%
  filter(workload == "galignments") %>%
  mutate(plan = paste0(bp_workers_effective, "x", threads_effective)) %>%
  select(method_family, method, plan, median_s) %>%
  arrange(median_s) %>%
  knitr::kable(col.names = c("Method family", "Method", "Plan", "Median time (s)"))

## ----step1-multi-plot, fig.width=7.5, fig.height=4.8--------------------------
multi_main %>%
  filter(workload == "step1") %>%
  mutate(
    method_label = if_else(method_family == "BamScale", "BamScale", "Rsamtools::scanBam + BiocParallel")
  ) %>%
  ggplot(aes(x = bp_workers_effective, y = median_s, color = method_label, group = method_label)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2.7) +
  scale_x_continuous(breaks = c(1, 2, 4, 8, 12)) +
  labs(
    x = "Workers",
    y = "Median time (s)",
    color = NULL,
    title = "Multi-file step1 scaling"
  ) +
  theme_minimal(base_size = 12)

## ----galignments-multi-plot, fig.width=7.5, fig.height=4.8--------------------
multi_main %>%
  filter(workload == "galignments") %>%
  mutate(
    method_label = if_else(method_family == "BamScale", "BamScale", "GenomicAlignments::readGAlignments + BiocParallel")
  ) %>%
  ggplot(aes(x = bp_workers_effective, y = median_s, color = method_label, group = method_label)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2.7) +
  scale_x_continuous(breaks = c(1, 2, 4, 8, 12)) +
  labs(
    x = "Workers",
    y = "Median time (s)",
    color = NULL,
    title = "Multi-file galignments scaling"
  ) +
  theme_minimal(base_size = 12)

## ----seqqual-single-plot, fig.width=8, fig.height=4.8-------------------------
seq_single %>%
  mutate(
    method_label = case_when(
      method_family == "BamScale" & comparison_track == "fair" ~ "BamScale (compatible)",
      method_family == "BamScale" & comparison_track == "optimized" ~ "BamScale (compact)",
      TRUE ~ "Rsamtools::scanBam"
    )
  ) %>%
  ggplot(aes(x = threads_effective, y = median_s, color = method_label)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2.7) +
  scale_x_continuous(breaks = c(1, 4, 12, 24, 48)) +
  labs(
    x = "Threads",
    y = "Median time (s)",
    color = NULL,
    title = "Single-file seqqual scaling"
  ) +
  theme_minimal(base_size = 12)

## ----seqqual-single-table-----------------------------------------------------
seq_single %>%
  mutate(
    method_label = case_when(
      method_family == "BamScale" & comparison_track == "fair" ~ "BamScale (compatible)",
      method_family == "BamScale" & comparison_track == "optimized" ~ "BamScale (compact)",
      TRUE ~ "Rsamtools::scanBam"
    ),
    plan = paste0(bp_workers_effective, "x", threads_effective)
  ) %>%
  select(method_label, plan, median_s) %>%
  arrange(median_s) %>%
  knitr::kable(col.names = c("Method", "Plan", "Median time (s)"))

## ----seqqual-multi-plot, fig.width=8, fig.height=4.8--------------------------
seq_multi %>%
  mutate(
    method_label = case_when(
      method_family == "BamScale" & comparison_track == "fair" ~ "BamScale (compatible)",
      method_family == "BamScale" & comparison_track == "optimized" ~ "BamScale (compact)",
      TRUE ~ "Rsamtools::scanBam + BiocParallel"
    )
  ) %>%
  ggplot(aes(x = bp_workers_effective, y = median_s, color = method_label, group = method_label)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2.7) +
  scale_x_continuous(breaks = c(1, 2, 4, 8, 12)) +
  labs(
    x = "Workers",
    y = "Median time (s)",
    color = NULL,
    title = "Multi-file seqqual scaling"
  ) +
  theme_minimal(base_size = 12)

## ----compact-gain-table-------------------------------------------------------
compact_gain_tbl <- bind_rows(seq_single, seq_multi) %>%
  filter(method_family == "BamScale") %>%
  select(scenario, workload, comparison_track, bp_workers_effective, threads_effective, median_s) %>%
  tidyr::pivot_wider(names_from = comparison_track, values_from = median_s) %>%
  mutate(
    plan = paste0(bp_workers_effective, "x", threads_effective),
    compact_gain = fair / optimized
  ) %>%
  arrange(scenario, bp_workers_effective, threads_effective)

knitr::kable(
  compact_gain_tbl %>%
    transmute(
      scenario,
      workload,
      plan,
      compatible_s = round(fair, 3),
      compact_s = round(optimized, 3),
      compact_gain = round(compact_gain, 3)
    ),
  col.names = c("Scenario", "Workload", "Plan", "Compatible time (s)", "Compact time (s)", "Compatible/Compact")
)

## ----compact-gain-plot, fig.width=7.5, fig.height=4.8-------------------------
compact_gain_tbl %>%
  ggplot(aes(x = plan, y = compact_gain, fill = scenario)) +
  geom_col(position = position_dodge(width = 0.8), width = 0.7) +
  geom_text(
    aes(label = sprintf("%.2fx", compact_gain)),
    position = position_dodge(width = 0.8),
    vjust = -0.2,
    size = 3.3
  ) +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  labs(
    x = "BamScale plan",
    y = "Compatible / compact",
    fill = "Scenario",
    title = "Compact seqqual reduces extraction overhead"
  ) +
  theme_minimal(base_size = 12)

## ----benchmark-session-info---------------------------------------------------
sessionInfo()