FakeDataR: Getting started

This vignette shows how to mirror the structure of real data with fully synthetic values, verify the structure, and produce an LLM-ready bundle.

Quick start

# Attach the package first so this quick-start chunk runs on its own.
library(FakeDataR)

# Seed the RNG so the sampled Progress values are identical on every run.
set.seed(1)

# tiny input with a few likely sensitive fields
df <- data.frame(
  id = sprintf("id%03d", 1:10),
  email = paste0("a", 1:10, "@x.com"),
  Progress = paste0(sample(80:100, 10, replace = TRUE), "%"),
  check.names = FALSE
)

# Prepared copy of the input, used as the reference for generation and
# validation. In the validation output below, Progress (entered as "NN%"
# text) is reported as numeric in this prepared copy.
orig <- prepare_input_data(df)

# Generate 10 synthetic rows. id and email are flagged as sensitive
# explicitly; sensitive_detect = TRUE presumably adds automatic detection on
# top of that list (confirm against the package reference).
fake_priv <- generate_fake_with_privacy(
  data = orig, n = 10, level = "low", seed = 1,
  sensitive = c("id", "email"),
  sensitive_detect = TRUE,
  sensitive_strategy = "fake",
  normalize = TRUE
)

# quick validation sample
head(validate_fake(orig, fake_priv), 5)
#>     column class_original class_fake class_match na_prop_original na_prop_fake
#> 1       id      character    integer       FALSE                0            0
#> 2    email      character  character        TRUE                0            0
#> 3 Progress        numeric    numeric        TRUE                0            0
#>   na_match blank_prop_original blank_prop_fake blank_match
#> 1     TRUE                   0              NA          NA
#> 2     TRUE                   0               0        TRUE
#> 3     TRUE                  NA              NA          NA
#>   range_within_original
#> 1                    NA
#> 2                    NA
#> 3                  TRUE

library(FakeDataR)

# Synthesize 200 rows shaped like mtcars, then compare the structure of the
# original and the synthetic data column by column
fake_mtc <- generate_fake_data(
  mtcars,
  n = 200,
  seed = 1
)
validate_fake(mtcars, fake_mtc)
#>    column class_original class_fake class_match na_prop_original na_prop_fake
#> 1     mpg        numeric    numeric        TRUE                0            0
#> 2     cyl        numeric    numeric        TRUE                0            0
#> 3    disp        numeric    numeric        TRUE                0            0
#> 4      hp        numeric    numeric        TRUE                0            0
#> 5    drat        numeric    numeric        TRUE                0            0
#> 6      wt        numeric    numeric        TRUE                0            0
#> 7    qsec        numeric    numeric        TRUE                0            0
#> 8      vs        numeric    numeric        TRUE                0            0
#> 9      am        numeric    numeric        TRUE                0            0
#> 10   gear        numeric    numeric        TRUE                0            0
#> 11   carb        numeric    numeric        TRUE                0            0
#>    na_match blank_prop_original blank_prop_fake blank_match
#> 1      TRUE                  NA              NA          NA
#> 2      TRUE                  NA              NA          NA
#> 3      TRUE                  NA              NA          NA
#> 4      TRUE                  NA              NA          NA
#> 5      TRUE                  NA              NA          NA
#> 6      TRUE                  NA              NA          NA
#> 7      TRUE                  NA              NA          NA
#> 8      TRUE                  NA              NA          NA
#> 9      TRUE                  NA              NA          NA
#> 10     TRUE                  NA              NA          NA
#> 11     TRUE                  NA              NA          NA
#>    range_within_original
#> 1                   TRUE
#> 2                   TRUE
#> 3                   TRUE
#> 4                   TRUE
#> 5                   TRUE
#> 6                   TRUE
#> 7                   TRUE
#> 8                   TRUE
#> 9                   TRUE
#> 10                  TRUE
#> 11                  TRUE

Factors, characters, and numerics

# Convert CO2 to a plain data frame once and reuse it for both generation
# and validation
co2_df <- as.data.frame(CO2)
fake_co2 <- generate_fake_data(co2_df, n = 200, seed = 2)
validate_fake(co2_df, fake_co2)
#>      column class_original class_fake class_match na_prop_original na_prop_fake
#> 1     Plant ordered/factor     factor       FALSE                0            0
#> 2      Type         factor     factor        TRUE                0            0
#> 3 Treatment         factor     factor        TRUE                0            0
#> 4      conc        numeric    numeric        TRUE                0            0
#> 5    uptake        numeric    numeric        TRUE                0            0
#>   na_match blank_prop_original blank_prop_fake blank_match
#> 1     TRUE                   0               0        TRUE
#> 2     TRUE                   0               0        TRUE
#> 3     TRUE                   0               0        TRUE
#> 4     TRUE                  NA              NA          NA
#> 5     TRUE                  NA              NA          NA
#>   range_within_original
#> 1                    NA
#> 2                    NA
#> 3                    NA
#> 4                  TRUE
#> 5                  TRUE

# ToothGrowth mixes numeric columns (len, dose) with a factor column (supp)
fake_tg <- generate_fake_data(
  ToothGrowth,
  n = 120,
  seed = 3
)
validate_fake(ToothGrowth, fake_tg)
#>   column class_original class_fake class_match na_prop_original na_prop_fake
#> 1    len        numeric    numeric        TRUE                0            0
#> 2   supp         factor     factor        TRUE                0            0
#> 3   dose        numeric    numeric        TRUE                0            0
#>   na_match blank_prop_original blank_prop_fake blank_match
#> 1     TRUE                  NA              NA          NA
#> 2     TRUE                   0               0        TRUE
#> 3     TRUE                  NA              NA          NA
#>   range_within_original
#> 1                  TRUE
#> 2                    NA
#> 3                  TRUE

Dates and POSIXct (time zones preserved)

# 50 consecutive calendar days starting on 2020-01-01
date_start <- as.Date("2020-01-01")
df_date <- data.frame(d = seq(date_start, by = "day", length.out = 50))

# Generate 80 synthetic dates and inspect the resulting column
fake_date <- generate_fake_data(
  df_date,
  n = 80,
  seed = 4
)
str(fake_date$d)
#>  Date[1:80], format: "2020-01-30" "2020-01-01" "2020-01-15" "2020-01-15" "2020-02-10" ...

# 200 hourly timestamps starting at midnight, America/New_York time zone
start_ny <- as.POSIXct("2023-05-01 00:00:00", tz = "America/New_York")
dt <- data.frame(when = seq(start_ny, by = "hour", length.out = 200))
# Draw 50 synthetic timestamps, then inspect their structure and span
fake_dt <- generate_fake_data(
  dt,
  n = 50,
  seed = 5
)
str(fake_dt$when)
#>  POSIXct[1:50], format: "2023-05-02 15:50:33" "2023-05-06 16:21:30" "2023-05-08 14:27:29" ...
range(fake_dt$when)
#> [1] "2023-05-01 02:52:55 EDT" "2023-05-09 00:13:36 EDT"

Public datasets: guarded with requireNamespace() and trimmed for speed

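Each chunk below runs only if its optional package (nycflights13 or palmerpenguins) is installed; otherwise it prints a message saying the example was skipped.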


if (!requireNamespace("nycflights13", quietly = TRUE)) {
  message("nycflights13 not installed - skipping.")
} else {
  fl <- nycflights13::flights

  # Work from a seeded 2,000-row subsample to keep the chunk small
  set.seed(10)
  keep_rows <- sample.int(nrow(fl), 2000)
  fl_small <- fl[keep_rows, ]

  fake_fl <- generate_fake_data(
    fl_small,
    n = 500,
    seed = 10,
    numeric_mode = "distribution"
  )

  # Show the validation summary for the first five columns only
  head(validate_fake(fl_small, fake_fl), 5)
}
#>           column class_original class_fake class_match na_prop_original
#> 1           year        integer    integer        TRUE            0.000
#> 2          month        integer    integer        TRUE            0.000
#> 3            day        integer    integer        TRUE            0.000
#> 4       dep_time        integer    integer        TRUE            0.027
#> 5 sched_dep_time        integer    integer        TRUE            0.000
#>   na_prop_fake na_match blank_prop_original blank_prop_fake blank_match
#> 1         0.00     TRUE                  NA              NA          NA
#> 2         0.00     TRUE                  NA              NA          NA
#> 3         0.00     TRUE                  NA              NA          NA
#> 4         0.02     TRUE                  NA              NA          NA
#> 5         0.00     TRUE                  NA              NA          NA
#>   range_within_original
#> 1                  TRUE
#> 2                  TRUE
#> 3                  TRUE
#> 4                  TRUE
#> 5                  TRUE

if (!requireNamespace("palmerpenguins", quietly = TRUE)) {
  message("palmerpenguins not installed - skipping.")
} else {
  # Keep four columns and drop rows with missing values before generating
  peng_cols <- c("species", "island", "bill_length_mm", "sex")
  peng <- na.omit(palmerpenguins::penguins[, peng_cols])

  fake_peng <- generate_fake_data(
    peng,
    n = 400,
    seed = 11,
    category_mode = "preserve"
  )

  head(validate_fake(peng, fake_peng), 5)
}
#>           column class_original class_fake class_match na_prop_original
#> 1        species         factor     factor        TRUE                0
#> 2         island         factor     factor        TRUE                0
#> 3 bill_length_mm        numeric    numeric        TRUE                0
#> 4            sex         factor     factor        TRUE                0
#>   na_prop_fake na_match blank_prop_original blank_prop_fake blank_match
#> 1            0     TRUE                   0               0        TRUE
#> 2            0     TRUE                   0               0        TRUE
#> 3            0     TRUE                  NA              NA          NA
#> 4            0     TRUE                   0               0        TRUE
#>   range_within_original
#> 1                    NA
#> 2                    NA
#> 3                  TRUE
#> 4                    NA

Gapminder demo

# Optional package; make the chunk robust
if (requireNamespace("gapminder", quietly = TRUE)) {
  # Seeds the global RNG; only matters if the optional subsample is enabled.
  set.seed(21)
  gm <- gapminder::gapminder
  # Keep it light if you want. The size is capped at nrow(gm) because
  # sample.int() without replacement errors when asked for more rows than
  # the data has:
  # gm <- gm[sample.int(nrow(gm), min(nrow(gm), 1000L)), ]

  fake_gm <- generate_fake_data(
    gm, n = 800, seed = 21,
    numeric_mode = "distribution",  # nicer numeric spread
    category_mode = "preserve"      # keep factor levels
  )

  # Last expression of the branch, so the validation table is printed.
  validate_fake(gm, fake_gm)
} else {
  message("gapminder not installed; skipping demo.")
}
#>      column class_original class_fake class_match na_prop_original na_prop_fake
#> 1   country         factor     factor        TRUE                0            0
#> 2 continent         factor     factor        TRUE                0            0
#> 3      year        integer    integer        TRUE                0            0
#> 4   lifeExp        numeric    numeric        TRUE                0            0
#> 5       pop        integer    integer        TRUE                0            0
#> 6 gdpPercap        numeric    numeric        TRUE                0            0
#>   na_match blank_prop_original blank_prop_fake blank_match
#> 1     TRUE                   0               0        TRUE
#> 2     TRUE                   0               0        TRUE
#> 3     TRUE                  NA              NA          NA
#> 4     TRUE                  NA              NA          NA
#> 5     TRUE                  NA              NA          NA
#> 6     TRUE                  NA              NA          NA
#>   range_within_original
#> 1                    NA
#> 2                    NA
#> 3                  TRUE
#> 4                  TRUE
#> 5                  TRUE
#> 6                  TRUE

Sensitive columns: fake vs drop

# Toy table with an id, two contact-style text columns, and a numeric column
set.seed(12)
n_pii <- 100L
row_ids <- seq_len(n_pii)
df_pii <- data.frame(
  id = row_ids,
  email = sprintf("user%03d@corp.com", row_ids),
  phone = sprintf("(415) 555-%04d", row_ids),
  spend = runif(n_pii, min = 10, max = 500)
)

# Same input and row count; only the strategy for sensitive columns differs
fake_keep <- generate_fake_data(
  df_pii,
  n = 120,
  sensitive_detect = TRUE,
  sensitive_strategy = "fake"
)
fake_drop <- generate_fake_data(
  df_pii,
  n = 120,
  sensitive_detect = TRUE,
  sensitive_strategy = "drop"
)

names(fake_keep)        # expect id/email/phone present but synthetic
#> [1] "id"    "email" "phone" "spend"
names(fake_drop)        # expect only "spend"
#> [1] "spend"

LLM bundle: data + schema + README (+ optional ZIP)

# Write the bundle into the session temp directory and print the paths of
# the schema, README, and ZIP entries on the returned object
bundle_dir <- tempdir()
b1 <- llm_bundle(
  data = ToothGrowth,
  n = 150,
  level = "high",
  seed = 10,
  formats = c("csv", "rds"),
  path = bundle_dir,
  filename = "toothgrowth_fake",
  write_prompt = TRUE,
  zip = TRUE
)
b1$schema_path
#> [1] "C:\\Users\\ZOBAER~1\\AppData\\Local\\Temp\\RtmpkfKgRn/toothgrowth_fake_schema.json"
b1$readme_path
#> [1] "C:\\Users\\ZOBAER~1\\AppData\\Local\\Temp\\RtmpkfKgRn/README_FOR_LLM.txt"
b1$zip_path
#> [1] "C:\\Users\\ZOBAER~1\\AppData\\Local\\Temp\\RtmpkfKgRn/toothgrowth_fake.zip"

Parquet export (optional)

if (!requireNamespace("arrow", quietly = TRUE)) {
  message("arrow not installed - skipping Parquet export.")
} else {
  fake_air <- generate_fake_data(airquality, n = 400, seed = 20)

  # Target file lives in the session temp directory
  parquet_path <- file.path(tempdir(), "air.parquet")
  export_fake(fake_air, parquet_path)
}

Reproducibility and performance

# Two separate runs with the same input, size, and seed, compared exactly
a1 <- generate_fake_data(
  CO2,
  n = 123,
  seed = 42
)
a2 <- generate_fake_data(
  CO2,
  n = 123,
  seed = 42
)
identical(a1, a2)
#> [1] TRUE

# 200,000-row input with a numeric, a letter, and a Date column
n_big <- 2e5
big <- data.frame(
  a = runif(n_big),
  b = sample(letters, size = n_big, replace = TRUE),
  c = as.Date("2020-01-01") + sample.int(3000, size = n_big, replace = TRUE)
)
# Time one generation run that produces as many rows as the input has
system.time(
  fake_big <- generate_fake_data(big, n = 2e5, seed = 99)
)