regcal_example

Introduction

The RegCalReliab package provides a unified framework for regression calibration under both external and internal reliability study designs. External reliability studies collect replicate measurements on a separate sample, while internal reliability studies collect replicates within the main study cohort. In both settings, regression calibration replaces the error-prone exposures with their estimated conditional expectations, thereby correcting bias and improving confidence interval coverage.
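As a sketch of the idea for a single error-prone exposure under the classical additive error model (the notation here is illustrative, not the package's internal parameterization):

\[
Z_{ij} = X_i + U_{ij}, \qquad U_{ij} \sim N(0, \sigma_u^2),
\]

where \(X_i\) is the true exposure and \(Z_{ij}\) its \(j\)-th replicate. Regression calibration fits the outcome model with \(X_i\) replaced by \(\hat{E}[X_i \mid \bar{Z}_i, W_i]\), the estimated conditional expectation given the replicate mean \(\bar{Z}_i\) and the error-free covariates \(W_i\), with the required variance components estimated from the reliability data.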

This document demonstrates the use of RegCalReliab through simulation studies under a logistic regression model. We generate data with two error-prone exposures (z1, z2) and two error-free covariates (W1, W2), with a true odds ratio of 1.5 for each exposure. Results are compared between naïve analyses (ignoring measurement error) and regression calibration, highlighting the bias correction and improved inference provided by the method.

library(RegCalReliab)

Data Simulation for the External Reliability Design

1. Setup

In this section we load packages, fix the seed, and define helper functions. We set the true slope \(\beta\) = log(1.5), corresponding to a true odds ratio (OR) of 1.5 for each error-prone exposure.

library(mgcv)
#> Loading required package: nlme
#> This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
set.seed(123)

# Helper for measurement error
add_err = function(v, sd = sqrt(0.4)) v + rnorm(length(v), 0, sd)

# True coefficient (on log-odds scale) and OR
beta = log(1.5)
OR_true = 1.5

2. One simulation replicate for the external logistic design

A single call to simulate_once() generates one dataset and fits both the naïve model and regression calibration via RC_ExReliab().

simulate_once = function() {
  # ---- True covariates ----
  x = mgcv::rmvn(3000, c(0,0), matrix(c(1,0.3,0.3,1), 2))
  
  # Error-free covariates (W1 = continuous, W2 = binary)
  W1_main = rnorm(1500)
  W2_main = rbinom(1500, 1, 0.5)
  W1_rep  = rnorm(1500)
  W2_rep  = rbinom(1500, 1, 0.5)
  
  # ---- Main study error-prone Z ----
  z.main = x[1:1500, 1:2] + matrix(rnorm(1500*2, 0, sqrt(0.4)), 1500, 2)
  colnames(z.main) = c("z1","z2")
  
  # ---- External replicates for Z ----
  z1_rep = rbind(
    cbind(add_err(x[1501:2000, 1]), add_err(x[1501:2000, 1]), NA, NA),
    cbind(add_err(x[2001:2400, 1]), add_err(x[2001:2400, 1]), add_err(x[2001:2400, 1]), NA),
    cbind(add_err(x[2401:3000, 1]), add_err(x[2401:3000, 1]), add_err(x[2401:3000, 1]), add_err(x[2401:3000, 1]))
  )
  colnames(z1_rep) = paste0("z1_", 1:4)
  
  z2_rep = rbind(
    cbind(add_err(x[1501:2000, 2]), add_err(x[1501:2000, 2]), NA, NA),
    cbind(add_err(x[2001:2400, 2]), add_err(x[2001:2400, 2]), add_err(x[2001:2400, 2]), NA),
    cbind(add_err(x[2401:3000, 2]), add_err(x[2401:3000, 2]), add_err(x[2401:3000, 2]), add_err(x[2401:3000, 2]))
  )
  colnames(z2_rep) = paste0("z2_", 1:4)
  
  
  # ---- Outcome ----
  p = plogis(-2.3 + beta*rowSums(x[1:1500, ]) + 0.5*W1_main - 0.3*W2_main)
  Y = rbinom(1500, 1, p)
  
  main_data = data.frame(
    Y = Y,
    z1 = z.main[, "z1"],
    z2 = z.main[, "z2"],
    W1 = W1_main,
    W2 = W2_main
  )
  
  rep_data = data.frame(z1_rep, z2_rep, W1 = W1_rep, W2 = W2_rep, check.names = FALSE)
  
  # ---- Regression Calibration ----
  res = RC_ExReliab(
    formula = Y ~ z1(z1_1, z1_2, z1_3, z1_4) + z2(z2_1, z2_2, z2_3, z2_4) + W1 + W2,
    main_data = main_data,
    rep_data = rep_data,
    link = "logistic"
  )
  
  return(res)
}
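Before launching the full Monte Carlo loop, it can help to inspect a single run. A minimal sketch, assuming (as the extraction code below does) that the returned object is a list whose uncorrected and corrected elements are coefficient matrices:

one_run = simulate_once()
names(one_run)               # expected to include "uncorrected" and "corrected"
round(one_run$corrected, 3)  # calibrated estimates from one dataset

Note that such a preview advances the RNG stream, so re-run set.seed(123) before the replication loop to reproduce the averages reported below.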

3. 100 simulation replicates

We repeat the entire simulation 100 times. Each run returns a results object containing two tables:

uncorrected = naïve logistic regression ignoring measurement error.

corrected = regression calibration estimates adjusted for error.

B = 100
results_list = replicate(B, simulate_once(), simplify = FALSE)

# Extract naive + corrected tables
naive_tabs = lapply(results_list, function(x) x$uncorrected)
corrected_tabs = lapply(results_list, function(x) x$corrected)

4. Average estimates across simulations

We compute the Monte Carlo average of the parameter estimates across the 100 runs.

avg_naive = Reduce("+", naive_tabs) / B
avg_corrected = Reduce("+", corrected_tabs) / B

cat("\nAverage Naive Logistic Estimates (B = ", B, "):\n", sep = "")
#> 
#> Average Naive Logistic Estimates (B = 100):
print(round(avg_naive, 5))
#>             Estimate Std. Error   z value Pr(>|z|)      OR  CI.low CI.high
#> (Intercept) -2.42815    0.10346 -23.48130  0.00000 0.08867 0.07247 0.10850
#> z1           0.31217    0.07817   3.98303  0.00788 1.37202 1.17689 1.59956
#> z2           0.30522    0.07817   3.90866  0.00644 1.36085 1.16754 1.58622
#> W1           0.48754    0.09118   5.34146  0.00003 1.63443 1.36664 1.95482
#> W2          -0.29440    0.17911  -1.63803  0.20679 0.75665 0.53294 1.07444

cat("\nAverage Corrected Logistic Estimates (B = ", B, "):\n", sep = "")
#> 
#> Average Corrected Logistic Estimates (B = 100):
print(round(avg_corrected, 5))
#>             Estimate Std. Error   z value Pr(>|z|)      OR  CI.low CI.high
#> (Intercept) -2.42815    0.10358 -23.46051  2.00000 0.08867 0.07245 0.10853
#> z1           0.40940    0.11609   3.52319  0.02041 1.51921 1.20951 1.90864
#> z2           0.39907    0.11638   3.43776  0.01553 1.50019 1.19418 1.88504
#> W1           0.48707    0.09112   5.35650  0.00003 1.63355 1.36619 1.95351
#> W2          -0.29554    0.17953  -1.64100  1.76609 0.75584 0.53191 1.07419

5. Coverage-rate (CR) calculation

Finally, we evaluate whether the true OR = 1.5 lies within the 95% confidence intervals.

inside_ci = function(tab, i, truth = OR_true) {
  ci = tab[i , c("CI.low", "CI.high")]
  ci[1] <= truth && truth <= ci[2]
}

row_z1 = which(rownames(avg_naive) == "z1")
row_z2 = which(rownames(avg_naive) == "z2")

coverage = function(tab_list, row) {
  mean(sapply(tab_list, function(tab) inside_ci(tab, row))) * 100
}

cov_z1_naive = coverage(naive_tabs, row_z1)
cov_z1_corr = coverage(corrected_tabs, row_z1)
cov_z2_naive = coverage(naive_tabs, row_z2)
cov_z2_corr = coverage(corrected_tabs, row_z2)

cat("\nCoverage of TRUE OR = 1.5 for error-prone exposure z1:\n")
#> 
#> Coverage of TRUE OR = 1.5 for error-prone exposure z1:
cat(sprintf("  • Naive                 : %5.1f %%\n", cov_z1_naive))
#>   • Naive                 :  77.0 %
cat(sprintf("  • Regression Calibration: %5.1f %%\n", cov_z1_corr))
#>   • Regression Calibration:  92.0 %

cat("\nCoverage of TRUE OR = 1.5 for error-prone exposure z2:\n")
#> 
#> Coverage of TRUE OR = 1.5 for error-prone exposure z2:
cat(sprintf("  • Naive                 : %5.1f %%\n", cov_z2_naive))
#>   • Naive                 :  76.0 %
cat(sprintf("  • Regression Calibration: %5.1f %%\n", cov_z2_corr))
#>   • Regression Calibration:  96.0 %

Data Simulation for the Internal Reliability Design

1. Setup

In this section we load packages, fix the seed, and define helper functions. We set the true slope \(\beta\) = log(1.5), corresponding to a true odds ratio (OR) of 1.5 for each error-prone exposure.

library(mgcv)
set.seed(123)

# Helper for measurement error
add_err = function(v, sd = sqrt(0.4)) v + rnorm(length(v), 0, sd)

# True slope and OR
beta = log(1.5)
OR_true = 1.5

2. One simulation replicate for the internal logistic design

Here we generate one dataset carrying both the main-study and reliability information; the replicates are included directly in the main_data data frame.

simulate_once = function() {
  # ---- True covariates ----
  x = mgcv::rmvn(3000, c(0,0,0),
                  matrix(c(1,0.3,0.2,
                           0.3,1,0.5,
                           0.2,0.5,1), nrow = 3))
  
  # Binary W2 depends on x1: higher prevalence above the median of x1
  med_x1 = median(x[,1])  # compute once rather than inside the loop
  w2 = sapply(x[,1], function(t) {
    if (t > med_x1) rbinom(1,1,0.5) else rbinom(1,1,0.3)
  })
  
  # Error-free covariates
  W = cbind(x[,3], w2)
  colnames(W) = c("W1", "W2")
  
  # ---- Replicate design ----
  # 1500 subjects with 1 replicate, 500 with 2, 400 with 3, 600 with 4
  r = c(rep(1,1500), rep(2,500), rep(3,400), rep(4,600))
  
  # Replicates for z1
  z1 = rbind(
    cbind(add_err(x[1:1500, 1]), NA, NA, NA),
    cbind(add_err(x[1501:2000, 1]), add_err(x[1501:2000, 1]), NA, NA),
    cbind(add_err(x[2001:2400, 1]), add_err(x[2001:2400, 1]), add_err(x[2001:2400, 1]), NA),
    cbind(add_err(x[2401:3000, 1]), add_err(x[2401:3000, 1]), add_err(x[2401:3000, 1]), add_err(x[2401:3000, 1]))
  )
  colnames(z1) = paste0("z1_",1:4)
  
  # Replicates for z2
  z2 = rbind(
    cbind(add_err(x[1:1500, 2]), NA, NA, NA),
    cbind(add_err(x[1501:2000, 2]), add_err(x[1501:2000, 2]), NA, NA),
    cbind(add_err(x[2001:2400, 2]), add_err(x[2001:2400, 2]), add_err(x[2001:2400, 2]), NA),
    cbind(add_err(x[2401:3000, 2]), add_err(x[2401:3000, 2]), add_err(x[2401:3000, 2]), add_err(x[2401:3000, 2]))
  )
  colnames(z2) = paste0("z2_",1:4)
  
  # ---- Outcome ----
  p = plogis(-2.65 + beta*rowSums(x[,1:3]) + beta*w2)
  Y = rbinom(3000, 1, p)
  
  # ---- Main data with outcome, replicates, covariates ----
  main_data = data.frame(Y, z1, z2, W)
  
  # ---- Regression calibration ----
  res = RC_InReliab(
    formula   = Y ~ myz1(z1_1, z1_2, z1_3, z1_4) +
                  myz2(z2_1, z2_2, z2_3, z2_4) +
                  W1 + W2,
    main_data = main_data,
    link      = "logistic"
  )
  
  return(res)
}
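For a quick single-run check of the internal design, the naïve and calibrated odds ratios for the two exposures can be placed side by side. A minimal sketch, assuming the two tables are matrices with rows named after the model terms and an OR column, as in the printed output below:

one_run = simulate_once()
rbind(naive     = one_run$uncorrected[c("myz1", "myz2"), "OR"],
      corrected = one_run$corrected[c("myz1", "myz2"), "OR"])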

3. 100 simulation replicates

We repeat the entire simulation 100 times. Each run returns a results object containing two tables:

uncorrected = naïve logistic regression ignoring measurement error.

corrected = regression calibration estimates adjusted for error.

B = 100
results_list = replicate(B, simulate_once(), simplify = FALSE)

# Collect naïve + corrected tables
naive_tabs = lapply(results_list, function(x) x$uncorrected)
corrected_tabs = lapply(results_list, function(x) x$corrected)

4. Average estimates across simulations

We compute the Monte Carlo average of the parameter estimates across the 100 runs.

avg_naive = Reduce("+", naive_tabs) / B
avg_corrected = Reduce("+", corrected_tabs) / B

cat("\nAverage Naive Logistic Estimates (B = ", B, "):\n", sep = "")
#> 
#> Average Naive Logistic Estimates (B = 100):
print(round(avg_naive, 5))
#>             Estimate Std. Error   z value Pr(>|z|)      OR  CI.low CI.high
#> (Intercept) -2.46347    0.07564 -32.57857  0.00000 0.08536 0.07362 0.09896
#> myz1         0.32896    0.05891   5.58205  0.00001 1.39137 1.23960 1.56175
#> myz2         0.31140    0.06348   4.90541  0.00038 1.36817 1.20808 1.54949
#> W1           0.45731    0.07129   6.41244  0.00000 1.58342 1.37684 1.82102
#> W2           0.44009    0.12598   3.49568  0.01636 1.56768 1.22472 2.00674

cat("\nAverage Corrected Logistic Estimates (B = ", B, "):\n", sep = "")
#> 
#> Average Corrected Logistic Estimates (B = 100):
print(round(avg_corrected, 5))
#>             Estimate Std. Error   z value Pr(>|z|)      OR  CI.low CI.high
#> (Intercept) -2.46506    0.07559 -32.62875  2.00000 0.08522 0.07352 0.09880
#> myz1         0.40418    0.07632   5.29511  0.00003 1.50144 1.29271 1.74396
#> myz2         0.39760    0.08593   4.62977  0.00093 1.49393 1.26225 1.76824
#> W1           0.40044    0.07559   5.29969  0.00005 1.49629 1.29017 1.73539
#> W2           0.41099    0.12687   3.24107  0.02646 1.52313 1.18781 1.95316

5. Coverage-rate (CR) calculation

Finally, we evaluate whether the true OR = 1.5 lies within the 95% confidence intervals.

inside_ci = function(tab, i, truth = OR_true) {
  ci = tab[i , c("CI.low", "CI.high")]
  ci[1] <= truth && truth <= ci[2]
}

row_z1 = which(rownames(avg_naive) == "myz1")
row_z2 = which(rownames(avg_naive) == "myz2")

coverage = function(tab_list, row) {
  mean(sapply(tab_list, function(tab) inside_ci(tab, row))) * 100
}

cov_z1_naive = coverage(naive_tabs, row_z1)
cov_z1_corr = coverage(corrected_tabs, row_z1)
cov_z2_naive = coverage(naive_tabs, row_z2)
cov_z2_corr = coverage(corrected_tabs, row_z2)

cat("\nCoverage of TRUE OR = 1.5 for error-prone exposure z1:\n")
#> 
#> Coverage of TRUE OR = 1.5 for error-prone exposure z1:
cat(sprintf("  • Naive                 : %5.1f %%\n", cov_z1_naive))
#>   • Naive                 :  73.0 %
cat(sprintf("  • Regression Calibration: %5.1f %%\n", cov_z1_corr))
#>   • Regression Calibration:  96.0 %

cat("\nCoverage of TRUE OR = 1.5 for error-prone exposure z2:\n")
#> 
#> Coverage of TRUE OR = 1.5 for error-prone exposure z2:
cat(sprintf("  • Naive                 : %5.1f %%\n", cov_z2_naive))
#>   • Naive                 :  70.0 %
cat(sprintf("  • Regression Calibration: %5.1f %%\n", cov_z2_corr))
#>   • Regression Calibration:  92.0 %

Summary

This document illustrates the use of the RegCalReliab package for regression calibration under measurement error in logistic regression models. Two study designs are demonstrated:

External reliability design: replicate measurements are collected on a separate sample and combined with the main study through RC_ExReliab().

Internal reliability design: replicates are collected within the main study cohort itself and analyzed with RC_InReliab().

In both cases, we simulate data with two error-prone exposures (z1, z2) and two error-free covariates (W1, W2). The true odds ratio for each exposure is set to 1.5.

Key findings from the simulations:

The naïve analysis is attenuated toward the null (average OR of roughly 1.36–1.39 against the true value of 1.5), and its 95% confidence intervals cover the truth at only 70–77%, well below the nominal rate.

Regression calibration essentially removes the attenuation (average OR of roughly 1.49–1.52) and restores coverage to 92–96%, close to the nominal 95% level.

Overall, the document demonstrates that regression calibration is an effective and practical method for addressing measurement error in regression models, and the RegCalReliab package provides a unified framework for implementation in both external and internal reliability studies.