## ----global-setup, include = FALSE--------------------------------------------
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>",
    fig.width = 10,
    fig.height = 6.5,
    echo = TRUE,
    eval = TRUE,
    results = 'hide',
    message = TRUE,
    warning = TRUE
)
suppressPackageStartupMessages(library(kableExtra))

## ----install-bioconda, eval = FALSE-------------------------------------------
# # Install from Bioconda
# # conda install -c bioconda r-tsenat

## ----install-github, eval = FALSE---------------------------------------------
# remotes::install_github("gallardoalba/TSENAT")

## ----quick-start-example, eval = FALSE----------------------------------------
# library(TSENAT)
# library(SummarizedExperiment)
# 
# # Load example data
# data("readcounts", package = "TSENAT")
# 
# # Load readcounts object, which contains Salmon-quantified
# # fragment counts with EM-based ambiguity resolution applied
# readcounts <- as.matrix(readcounts)
# 
# # Load sample metadata and annotation
# metadata_df <- read.table(
#   system.file("extdata", "metadata.tsv", package = "TSENAT"),
#   header = TRUE, sep = "\t"
# )
# gff3_file <- system.file("extdata", "annotation.gff3.gz", package = "TSENAT")
# 
# # Create analysis object with transcript counts, annotation, and metadata
# config <- TSENAT_config(
#   sample_col = "sample",
#   condition_col = "condition",
#   paired = TRUE,
#   subject_col = "paired_samples",
#   control = "normal",
#   q = seq(0, 2, by = 0.05)
# )
# 
# ## Build TSENATAnalysis object
# analysis <- build_analysis(
#   readcounts = readcounts,
#   tx2gene = gff3_file,
#   metadata = metadata_df,
#   config = config,
#   tpm = tpm,
#   effective_length = effective_length)
# 
# # Filter low-abundance transcripts
# analysis <- filter_analysis(analysis)
# 
# # Compute Tsallis entropy using S4 wrapper (using single q value for quick start)
# analysis <- calculate_diversity(analysis)
# 
# # Plot overall q-curve
# p_qcurve <- plot_diversity_spectrum(
#     analysis,
#     dev_width = 12,
#     dev_height = 8)
# print(p_qcurve)

## ----workflow-setup-----------------------------------------------------------
# Load packages
suppressPackageStartupMessages({
    library(TSENAT)
    library(ggplot2)
    library(SummarizedExperiment)
})

# Setup random seed for reproducibility
set.seed(42)


## ----prepare-example-data-----------------------------------------------------
# Load example dataset (lazy-loaded as 'readcounts' by default)
# This includes SALMON preprocessing outputs:
# - readcounts: Salmon NumReads (raw fragment counts with EM-based ambiguity resolution)
# - tpm: Salmon TPM matrix (length and library-normalized expression)
# - effective_length: Salmon EffectiveLength vector (accounts for fragment length distribution)
data(readcounts)

readcounts <- as.matrix(readcounts)

metadata_df <- read.table(
    system.file("extdata", "metadata.tsv", package = "TSENAT"),
    header = TRUE, sep = "\t"
)

gff3_file <- system.file("extdata", "annotation.gff3.gz", package = "TSENAT")

## ----initialize-analysis-object-----------------------------------------------
## Configure analysis parameters first (best practice: fail-fast principle)
## This validates all parameters before object creation
config <- TSENAT_config(
  sample_col = "sample",
  condition_col = "condition",
  subject_col = "paired_samples",
  q = seq(0, 2, by = 0.05),
  nthreads = 2,
  paired = TRUE,
  control = "normal"
)

## Build a complete `TSENATAnalysis` object 
analysis <- build_analysis(
  config = config,
  readcounts = readcounts,
  metadata = metadata_df,
  tx2gene = gff3_file, 
  tpm = tpm,
  effective_length = effective_length
)


## ----filter-validate-main-----------------------------------------------------
# Filter lowly-expressed transcripts
analysis <- filter_analysis(analysis, stringency = "medium")

## ----tsallis-entropy-computation, message = FALSE, warning = FALSE------------
# Set random seed for reproducible bootstrap CI calculations
set.seed(12345)

# Compute diversity
analysis <- calculate_diversity(
    analysis,
    norm = TRUE
)

# Extract and display diversity results
diversity <- results(analysis,
    type = "diversity",
    n_genes = 4,
    sample = "SRR14800481")

print(diversity)

## ----kable-tsallis-calc-sequence, echo = FALSE, results = "asis"--------------
# Extract diversity results for three specific q values using the results() accessor
# Show entropy values for ONE sample across three q values (0.05, 1, 2)
# Note: q=0 (richness) is computed deterministically outside bootstrap to avoid discrete data artifacts

q_values_to_show <- c(0, 0.5, 1, 1.5, 2)
sample_data <- NULL
first_sample_name <- NULL

for (q_val in q_values_to_show) {
    # Use tryCatch to gracefully handle q-values that don't exist
    div_q <- tryCatch({
        results(analysis, type = "diversity", q = q_val, format = "se")
    }, error = function(e) {
        NULL
    })
    
    if (!is.null(div_q)) {
        # Extract matrix from SummarizedExperiment
        if (is(div_q, "SummarizedExperiment")) {
            mat <- assay(div_q)
        } else if (is.matrix(div_q)) {
            mat <- div_q
        } else {
            next
        }
        
        # Get first 4 genes and first sample only
        first_sample_idx <- 1
        first_sample_name <- colnames(mat)[first_sample_idx]
        mat_subset <- mat[1:min(4, nrow(mat)), first_sample_idx, drop = FALSE]
        
        # Add values for this q with descriptive column names
        if (is.null(sample_data)) {
            sample_data <- data.frame(Gene = rownames(mat_subset))
        }
        
        col_name <- paste0("q=", sprintf("%.1f", q_val))
        sample_data[[col_name]] <- as.numeric(mat_subset)
    }
}

if (!is.null(sample_data) && nrow(sample_data) > 0) {
    knitr::kable(
        sample_data,
        caption = paste0("**Table 1:** Tsallis entropy for first 4 genes across three diversity scales (sample: ", 
                         first_sample_name, ")"),
        digits = 5,
        row.names = FALSE
    )
} else {
  cat("Note: Diversity results not available for the requested q-values. ",
      "Ensure calculate_diversity() was run with appropriate q-value configuration.")
}

## ----fig-1-isoform-diversity-profiles, fig.width=10, fig.height=5.2, out.width="98%", fig.pos="H", fig.cap = "**Figure 1:** Isoform diversity profiles across entropic indices. Lines show normalized Tsallis entropy (0-1) for each sample, blue control and red treatment."----
# Plot overall q-curve
p_qcurve <- plot_diversity_spectrum(analysis)
print(p_qcurve)

## ----qc-entropy-outlier-detection, echo=TRUE----------------------------------
# Perform multi-q sample influence analysis via S4 wrapper
analysis <- calculate_m_estimator(
    analysis,
    loss_type = "huber",
    q_combine_method = "mean",
    influence_threshold = 0.75
)

# Extract results from metadata using accessor function
sample_qc <- metadata(analysis, "m_estimate_results")
print(sample_qc)

## ----display-qc-outlier-results, echo=FALSE, results="asis"-------------------
if (!is.null(sample_qc) && nrow(sample_qc) > 0) {
  # Round numeric columns for better readability
  sample_qc$Proportion_Affected <- round(sample_qc$Proportion_Affected, 3)
  sample_qc$Distance_from_Centroid <- round(sample_qc$Distance_from_Centroid, 4)
  
  # Select columns: exclude Robustness_Weight and Pair_ID
  cols_to_show <- c("Sample", "Condition", "Proportion_Affected", 
                    "Distance_from_Centroid", "Status")
  cols_to_show <- cols_to_show[cols_to_show %in% colnames(sample_qc)]
  
  knitr::kable(
      sample_qc[, cols_to_show],
      caption = "**Table 2 | Sample Influence Assessment via M-estimation.** Ranked by magnitude of resampling-based influence values. Columns: Sample identifier; condition; proportion affected; distance from centroid; outlier status. M-estimation identifies influential samples for sensitivity analysis.",
      digits = 4,
      row.names = FALSE
  )
}

## ----test-q-condition-interaction---------------------------------------------
# Scale-adaptive interaction test across q values using S4 wrapper
analysis <- calculate_sait(
    analysis,
    method = "gam",
    multicorr = "hochberg"
)
       
# Extract SAIT interaction results
sait_results <- results(analysis, type = "sait", rankBy = "pvalue", n = 10)

print(sait_results)

## ----display-interaction-results, echo = FALSE, results="asis"----------------
sait_res <- results(analysis, type = "sait")
if (!is.null(sait_res) && is.data.frame(sait_res) && nrow(sait_res) > 0) {
    display <- head(sait_res, 6)
    
    # Select only most informative columns
    key_cols <- c("gene", "p_interaction", "adj_p_interaction", 
                  "effect_size", "test_statistic", "model_converged", "heteroscedasticity_detected")
    key_cols <- key_cols[key_cols %in% colnames(display)]
    display <- display[, key_cols]
    
    # Round numeric columns (but NOT p-value columns to preserve significance)
    numeric_cols <- sapply(display, is.numeric)
    pval_cols <- grepl("^p_|^adj_p_", colnames(display))
    display[, numeric_cols & !pval_cols] <- round(display[, numeric_cols & !pval_cols], 4)
    
    # Format p-values to show more precision (scientific notation for very small values)
    for (col in colnames(display)) {
        if (is.numeric(display[[col]]) && grepl("^p_|^adj_p_", col)) {
            display[[col]] <- format(display[[col]], scientific = TRUE, digits = 3)
        }
    }
    
    knitr::kable(display, 
                caption = "**Table 3 | Leading genes with scale-dependent condition effects from generalized additive models.** We display the 6 genes with lowest adjusted p-values (*q* < 0.05). Benjamini-Hochberg correction applied; columns show gene identifier, effect size, test statistic, convergence status, and heteroscedasticity detection. Methods: GAMs with *q* and condition as smooth predictors [@S190].",
                col.names = c("Gene", "P-value", "Adj. P-value", "Effect Size", "Test Statistic", "Model Converged", "Heteroscedasticity"),
                digits = 4, row.names = FALSE)
}

## ----fig-2-scale-dependent-genes, fig.width=14, fig.height=12, out.width="98%", fig.pos="H", fig.cap = "**Figure 2:** Scale-dependent interaction analysis. GAM-identified genes showing significant q$\times$condition effects (Benjamini-Hochberg q < 0.05)."----
# Plot q-curve profiles for the top 4 genes using the S4 wrapper
combined_plot <- plot_sait(
    analysis,
    n_top = 4
)

print(combined_plot)

## ----identify-isoform-switching-transcripts, echo=TRUE------------------------
# Multi-Q analysis: Test switching at different q values
analysis <- calculate_jis(
    analysis,
    q = c(0, 0.5, 1, 1.5, 2),
    nboot = 100,
    lm_p_threshold = 0.05,
    threshold = 90
)

## ----extract-switching-results, echo=TRUE-------------------------------------
# Retrieve gene switching tables
tables_result <- results(analysis, type = "switching_tables")

## ----display-switching-table, echo=FALSE, results='asis'----------------------
# Only display tables if tables_result was successfully created
if (exists("tables_result") && is.list(tables_result) && 
    !is.null(tables_result$gene_headers) && length(tables_result$gene_headers) > 0) {
  # Iterate over the prepared comparison tables and display top 2 genes
  for (gene_idx in 1:min(2, length(tables_result$gene_headers))) {
    # Print gene header with proper spacing and formatting
    if (knitr::is_latex_output()) {
      cat("\n\n\\subsection*{Gene: ", tables_result$gene_headers[gene_idx], "}\n\n", sep="")
    } else {
      cat("\n\n### Gene: ", tables_result$gene_headers[gene_idx], "\n\n", sep="")
    }
    
    comparison_data <- tables_result$comparison_tables[[gene_idx]]
    q_metadata <- tables_result$q_metadata[[gene_idx]]
    
    if (!is.null(comparison_data) && !is.null(q_metadata)) {
      # Create display labels for column names (for knitr::kable)
      q_values_available <- q_metadata$q_values_available
      # Filter to show only q=0.50, q=1.00, q=1.50
      q_indices <- which(sapply(q_values_available, function(qk) {
        q_num <- q_metadata$q_key_to_value[[qk]]
        q_num %in% c(0.50, 1.00, 1.50)
      }))
      q_values_to_show <- q_values_available[q_indices]
      
      col_display <- c(
        "Transcript",
        sapply(q_values_to_show, function(qk) {
          q_num <- q_metadata$q_key_to_value[[qk]]
          paste0("q=", sprintf("%.2f", q_num))
        }),
        "Direction Consistency"
      )
      
      # Select only these columns from comparison_data
      cols_to_keep <- c(1, q_indices + 1, ncol(comparison_data))
      comparison_data_filtered <- comparison_data[, cols_to_keep, drop = FALSE]
      
      # Render table with kableExtra
      table_output <- knitr::kable(comparison_data_filtered, 
                         digits = 3, 
                         col.names = col_display,
                         align = c("l", rep("c", ncol(comparison_data_filtered) - 2), "r"))
      print(table_output)
    }
    
    cat("\n")
  }
} else {
  cat("Multi-Q switching tables not available; skipping this section.\n")
}

## ----fig-3-transcript-jackknife-influence, echo=TRUE, results='hide', fig.width=12, fig.height=10, eval=TRUE, out.width="98%", fig.pos="H", fig.cap = "**Figure 3:** Transcript-level switching patterns via jackknife resampling. Rows q-values (0-2.0), columns transcripts. Red/blue indicates condition influence."----
# Generate multi-Q heatmaps for top 4 genes using S4 wrapper
plot_jis_delta(analysis, n_genes = 4)

## ----fig-4-transcript-abundance-heatmap, echo=TRUE, results='hide', fig.width=12, fig.height=8, eval=TRUE, out.width="98%", fig.pos="H", fig.cap = "**Figure 4:** Transcript abundance heatmap with hierarchical clustering. Rows are transcripts, columns are conditions. Blue low, red high expression."----
# Generate transcript abundance heatmap
plot_expression(analysis, top_n = 4, metric = "median")

## ----effect-size-multiq-------------------------------------------------------
# Computes pairwise information-theoretic distance (Tsallis divergence)
analysis <- calculate_divergence(analysis)

## ----divergence-assay-display, echo = FALSE, results="asis"-------------------
# Display the raw divergence computation results
if (exists("analysis")) {
    div_results <- results(analysis, type = "divergence")
    
    # Extract divergence data - format depends on how it's stored in divergence_results slot
    if (is.matrix(div_results) && nrow(div_results) > 0) {
        div_table <- div_results[1:min(4, nrow(div_results)), 1:min(3, ncol(div_results))]
        
        knitr::kable(
            div_table,
            caption = "**Table 5 | Pairwise Tsallis divergence estimates.** Divergence (distance) between conditions from Tsallis entropy framework. Columns: gene identifier; pairwise comparison; divergence value; confidence interval (95%). Ranked by magnitude.",
            digits = 5,
            row.names = TRUE
        )
    } else if (is.list(div_results) && length(div_results) > 0) {
        # If stored as list per q-value, extract first q-value results
        first_q_result <- div_results[[1]]
        if (is.matrix(first_q_result)) {
            div_table <- first_q_result[1:min(4, nrow(first_q_result)), 1:min(3, ncol(first_q_result))]
            
            knitr::kable(
                div_table,
                caption = "**Table 6 | Pairwise Tsallis divergence from list-based results.** Alternative representation of divergence metrics for leading genes. Columns: gene identifier; pairwise comparison; divergence value; uncertainty. Asymmetry ($D(A \\to B) \\ne D(B \\to A)$) reflects information-theoretic properties. Ordered by divergence magnitude.",
                digits = 5,
                row.names = TRUE
            )
        }
    }
}

## ----effect-size-merge, echo=TRUE---------------------------------------------
# Compute effect sizes
analysis <- calculate_effect_sizes(
    analysis,
    significance_threshold = 0.05,
    enrich_per_q_pattern = TRUE
)

# Extract top 6 genes by statistical significance
top_genes_result <- results(
    analysis, 
    type = "effect_sizes_divergence",
    top_n = 6,
    sort_by = "p_value_interaction"
)

print(top_genes_result)

## ----divergence-effect-sizes-table, echo=FALSE, results="asis"----------------
# Select key columns for display (effect sizes at key q-values, pattern, and divergence metrics)
display_cols <- c(
    "gene", "p_value_interaction", "slope_diff",
    "effect_size_D_q0_5", "effect_size_D_q1", "effect_size_D_q2",
    "per_q_pattern"
)

# Only keep columns that exist in the data
display_cols <- display_cols[display_cols %in% colnames(top_genes_result)]
top_genes_display <- top_genes_result[, display_cols, drop = FALSE]

# Rename columns for better display
col_names <- c(
    "Gene", "P-value", "Slope Diff",
    "D(q=0.5)", "D(q=1.0)", "D(q=2.0)",
    "Pattern"
)
colnames(top_genes_display) <- col_names[1:ncol(top_genes_display)]

# Display the result as a formatted table
knitr::kable(
    top_genes_display, 
    caption = "**Table 7 | Top 6 Genes by Effect Size (Tsallis Divergence).** Genes ranked by interaction p-value; columns show divergence at key q-values (0.5, 1.0, 2.0) and inferred pattern type (rare-driven, abundant-driven, or balanced).",
    digits = 4,
    row.names = FALSE
)

## ----effect-size-plot-multiq, fig.width=10, fig.height=6, out.width="98%", fig.align='center', fig.pos="H", fig.cap='**Figure 5:** Distribution of Tsallis divergence effect sizes across genes. X-axis shows divergence values; y-axis shows gene density. Threshold line indicates substantial divergence (D > 0.1) distinguishing signal genes from background.'----
# Visualize the distribution of Tsallis divergence effect sizes across genes
plot_obj <- plot_divergence_distribution(
    analysis, 
    threshold = 0.1
)

print(plot_obj)

## ----compare-q-spectra-plot, fig.width=10, fig.height=6, out.width='100%', fig.align='center', fig.cap='**Figure 6:** Q-spectrum curves for top genes by significance. Each line represents divergence as a function of q-value; shape reveals biological pattern (rare-driven, balanced, or abundant-driven).'----
# Visualize q-spectrum curves for top 4 genes by significance
p_multi <- plot_divergence_spectrum(
    analysis, 
    n_genes = 4, 
    use_pvalue_ranking = TRUE
)

print(p_multi)

## ----visualize-q-spectrum, fig.width=10, fig.height=5, out.width='100%', fig.align='center', fig.cap='**Figure 7:** Global divergence spectrum across all genes. Aggregated q-spectrum pattern shows which abundance scales (rare vs. abundant isoforms) are affected genome-wide.'----
# Visualize global divergence q-spectrum across all genes (aggregated)
# Plot is rendered directly by knitr (no file dependency)
p <- plot_divergence_spectrum(analysis)

print(p)

## ----session-info, results = 'markup'-----------------------------------------
sessionInfo()

