This vignette shows how to mirror the structure of real data with fully synthetic values, verify the structure, and produce an LLM-ready bundle.
# Attach the package before its functions are first used in this chunk.
library(FakeDataR)

# Seed the RNG so the randomly drawn Progress values are reproducible.
set.seed(1)

# Tiny input with a few likely sensitive fields (id, email) plus a
# percent-formatted text column.
df <- data.frame(
  id = sprintf("id%03d", 1:10),
  email = paste0("a", 1:10, "@x.com"),
  Progress = paste0(sample(80:100, 10, replace = TRUE), "%"),
  check.names = FALSE
)

# Pre-process the raw input; the validation output reports Progress as
# numeric after this step.
orig <- prepare_input_data(df)

# Generate 10 synthetic rows. "id" and "email" are named explicitly as
# sensitive, automatic detection is switched on as well, and sensitive
# columns use the "fake" strategy.
fake_priv <- generate_fake_with_privacy(
  data = orig, n = 10, level = "low", seed = 1,
  sensitive = c("id", "email"),
  sensitive_detect = TRUE,
  sensitive_strategy = "fake",
  normalize = TRUE
)

# Quick validation sample: first rows of the original-vs-fake comparison.
head(validate_fake(orig, fake_priv), 5)
#> column class_original class_fake class_match na_prop_original na_prop_fake
#> 1 id character integer FALSE 0 0
#> 2 email character character TRUE 0 0
#> 3 Progress numeric numeric TRUE 0 0
#> na_match blank_prop_original blank_prop_fake blank_match
#> 1 TRUE 0 NA NA
#> 2 TRUE 0 0 TRUE
#> 3 TRUE NA NA NA
#> range_within_original
#> 1 NA
#> 2 NA
#> 3 TRUE
library(FakeDataR)
# Simplest case: synthesize 200 rows from a plain data.frame (mtcars),
# then compare the fake against the original column by column.
fake_mtc <- generate_fake_data(
  mtcars,
  n = 200,
  seed = 1
)
validate_fake(mtcars, fake_mtc)
#> column class_original class_fake class_match na_prop_original na_prop_fake
#> 1 mpg numeric numeric TRUE 0 0
#> 2 cyl numeric numeric TRUE 0 0
#> 3 disp numeric numeric TRUE 0 0
#> 4 hp numeric numeric TRUE 0 0
#> 5 drat numeric numeric TRUE 0 0
#> 6 wt numeric numeric TRUE 0 0
#> 7 qsec numeric numeric TRUE 0 0
#> 8 vs numeric numeric TRUE 0 0
#> 9 am numeric numeric TRUE 0 0
#> 10 gear numeric numeric TRUE 0 0
#> 11 carb numeric numeric TRUE 0 0
#> na_match blank_prop_original blank_prop_fake blank_match
#> 1 TRUE NA NA NA
#> 2 TRUE NA NA NA
#> 3 TRUE NA NA NA
#> 4 TRUE NA NA NA
#> 5 TRUE NA NA NA
#> 6 TRUE NA NA NA
#> 7 TRUE NA NA NA
#> 8 TRUE NA NA NA
#> 9 TRUE NA NA NA
#> 10 TRUE NA NA NA
#> 11 TRUE NA NA NA
#> range_within_original
#> 1 TRUE
#> 2 TRUE
#> 3 TRUE
#> 4 TRUE
#> 5 TRUE
#> 6 TRUE
#> 7 TRUE
#> 8 TRUE
#> 9 TRUE
#> 10 TRUE
#> 11 TRUE
# Convert CO2 to a plain data.frame once and reuse it for both the
# generation and the validation call.
co2_df <- as.data.frame(CO2)
fake_co2 <- generate_fake_data(co2_df, n = 200, seed = 2)
validate_fake(co2_df, fake_co2)
#> column class_original class_fake class_match na_prop_original na_prop_fake
#> 1 Plant ordered/factor factor FALSE 0 0
#> 2 Type factor factor TRUE 0 0
#> 3 Treatment factor factor TRUE 0 0
#> 4 conc numeric numeric TRUE 0 0
#> 5 uptake numeric numeric TRUE 0 0
#> na_match blank_prop_original blank_prop_fake blank_match
#> 1 TRUE 0 0 TRUE
#> 2 TRUE 0 0 TRUE
#> 3 TRUE 0 0 TRUE
#> 4 TRUE NA NA NA
#> 5 TRUE NA NA NA
#> range_within_original
#> 1 NA
#> 2 NA
#> 3 NA
#> 4 TRUE
#> 5 TRUE
# ToothGrowth: synthesize 120 rows and validate them against the original.
fake_tg <- generate_fake_data(
  ToothGrowth,
  n = 120,
  seed = 3
)
validate_fake(ToothGrowth, fake_tg)
#> column class_original class_fake class_match na_prop_original na_prop_fake
#> 1 len numeric numeric TRUE 0 0
#> 2 supp factor factor TRUE 0 0
#> 3 dose numeric numeric TRUE 0 0
#> na_match blank_prop_original blank_prop_fake blank_match
#> 1 TRUE NA NA NA
#> 2 TRUE 0 0 TRUE
#> 3 TRUE NA NA NA
#> range_within_original
#> 1 TRUE
#> 2 NA
#> 3 TRUE
# Date column: 50 consecutive days starting on 2020-01-01.
first_day <- as.Date("2020-01-01")
df_date <- data.frame(
  d = seq(first_day, by = "day", length.out = 50)
)

# Synthesize 80 rows and inspect the structure of the generated column.
fake_date <- generate_fake_data(df_date, n = 80, seed = 4)
str(fake_date$d)
#> Date[1:80], format: "2020-01-30" "2020-01-01" "2020-01-15" "2020-01-15" "2020-02-10" ...
# Date-time column: 200 hourly timestamps in the America/New_York zone.
# seq() dispatches to the POSIXt method for a POSIXct start value.
first_stamp <- as.POSIXct("2023-05-01 00:00:00", tz = "America/New_York")
dt <- data.frame(
  when = seq(first_stamp, by = "hour", length.out = 200)
)

# Synthesize 50 rows, then inspect the structure and span of the result.
fake_dt <- generate_fake_data(dt, n = 50, seed = 5)
str(fake_dt$when)
#> POSIXct[1:50], format: "2023-05-02 15:50:33" "2023-05-06 16:21:30" "2023-05-08 14:27:29" ...
range(fake_dt$when)
#> [1] "2023-05-01 02:52:55 EDT" "2023-05-09 00:13:36 EDT"
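The following chunks use optional packages (nycflights13, palmerpenguins, gapminder) and run only if those packages are installed; otherwise each chunk is skipped with a message.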
# Optional demo: needs nycflights13; emits a message and skips when absent.
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  message("nycflights13 not installed - skipping.")
} else {
  fl <- nycflights13::flights

  # Draw a seeded random subset of 2000 rows as a smaller input.
  set.seed(10)
  keep_rows <- sample.int(nrow(fl), 2000)
  fl_small <- fl[keep_rows, ]

  fake_fl <- generate_fake_data(
    fl_small,
    n = 500,
    seed = 10,
    numeric_mode = "distribution"
  )

  # Show only the first five rows of the validation table.
  head(validate_fake(fl_small, fake_fl), 5)
}
#> column class_original class_fake class_match na_prop_original
#> 1 year integer integer TRUE 0.000
#> 2 month integer integer TRUE 0.000
#> 3 day integer integer TRUE 0.000
#> 4 dep_time integer integer TRUE 0.027
#> 5 sched_dep_time integer integer TRUE 0.000
#> na_prop_fake na_match blank_prop_original blank_prop_fake blank_match
#> 1 0.00 TRUE NA NA NA
#> 2 0.00 TRUE NA NA NA
#> 3 0.00 TRUE NA NA NA
#> 4 0.02 TRUE NA NA NA
#> 5 0.00 TRUE NA NA NA
#> range_within_original
#> 1 TRUE
#> 2 TRUE
#> 3 TRUE
#> 4 TRUE
#> 5 TRUE
# Optional demo: needs palmerpenguins; emits a message and skips when absent.
if (!requireNamespace("palmerpenguins", quietly = TRUE)) {
  message("palmerpenguins not installed - skipping.")
} else {
  # Keep four columns and drop every row that has a missing value.
  keep_cols <- c("species", "island", "bill_length_mm", "sex")
  peng <- na.omit(palmerpenguins::penguins[, keep_cols])

  fake_peng <- generate_fake_data(
    peng,
    n = 400,
    seed = 11,
    category_mode = "preserve"
  )

  # Show the first rows of the validation table.
  head(validate_fake(peng, fake_peng), 5)
}
#> column class_original class_fake class_match na_prop_original
#> 1 species factor factor TRUE 0
#> 2 island factor factor TRUE 0
#> 3 bill_length_mm numeric numeric TRUE 0
#> 4 sex factor factor TRUE 0
#> na_prop_fake na_match blank_prop_original blank_prop_fake blank_match
#> 1 0 TRUE 0 0 TRUE
#> 2 0 TRUE 0 0 TRUE
#> 3 0 TRUE NA NA NA
#> 4 0 TRUE 0 0 TRUE
#> range_within_original
#> 1 NA
#> 2 NA
#> 3 TRUE
#> 4 NA
# gapminder is optional: guard the chunk so it degrades to a message.
if (!requireNamespace("gapminder", quietly = TRUE)) {
  message("gapminder not installed; skipping demo.")
} else {
  gm <- gapminder::gapminder
  set.seed(21)
  # To lighten the input, subsample first:
  #   gm <- gm[sample.int(nrow(gm), 2000), ]

  fake_gm <- generate_fake_data(
    gm,
    n = 800,
    seed = 21,
    numeric_mode = "distribution", # nicer numeric spread
    category_mode = "preserve"     # keep factor levels
  )

  validate_fake(gm, fake_gm)
}
#> column class_original class_fake class_match na_prop_original na_prop_fake
#> 1 country factor factor TRUE 0 0
#> 2 continent factor factor TRUE 0 0
#> 3 year integer integer TRUE 0 0
#> 4 lifeExp numeric numeric TRUE 0 0
#> 5 pop integer integer TRUE 0 0
#> 6 gdpPercap numeric numeric TRUE 0 0
#> na_match blank_prop_original blank_prop_fake blank_match
#> 1 TRUE 0 0 TRUE
#> 2 TRUE 0 0 TRUE
#> 3 TRUE NA NA NA
#> 4 TRUE NA NA NA
#> 5 TRUE NA NA NA
#> 6 TRUE NA NA NA
#> range_within_original
#> 1 NA
#> 2 NA
#> 3 TRUE
#> 4 TRUE
#> 5 TRUE
#> 6 TRUE
# Build a 100-row table with PII-like columns (id, email, phone) and one
# ordinary numeric column (spend). The seed fixes the runif() draw.
set.seed(12)
n_users <- 100
user_idx <- seq_len(n_users)
df_pii <- data.frame(
  id = user_idx,
  email = sprintf("user%03d@corp.com", user_idx),
  phone = sprintf("(415) 555-%04d", user_idx),
  spend = runif(n_users, 10, 500)
)

# Same request twice; only the strategy for detected sensitive columns
# differs: "fake" versus "drop".
fake_keep <- generate_fake_data(
  df_pii,
  n = 120,
  sensitive_detect = TRUE,
  sensitive_strategy = "fake"
)
fake_drop <- generate_fake_data(
  df_pii,
  n = 120,
  sensitive_detect = TRUE,
  sensitive_strategy = "drop"
)

names(fake_keep) # expect id/email/phone present but synthetic
#> [1] "id" "email" "phone" "spend"
names(fake_drop) # expect only "spend"
#> [1] "spend"
# Write an LLM-ready bundle for ToothGrowth into the session temp directory:
# CSV and RDS outputs, with the prompt and zip options switched on.
out_dir <- tempdir()
b1 <- llm_bundle(
  data = ToothGrowth,
  n = 150,
  level = "high",
  seed = 10,
  formats = c("csv", "rds"),
  path = out_dir,
  filename = "toothgrowth_fake",
  write_prompt = TRUE,
  zip = TRUE
)

# Paths of the files recorded in the returned object.
b1$schema_path
#> [1] "C:\\Users\\ZOBAER~1\\AppData\\Local\\Temp\\RtmpkfKgRn/toothgrowth_fake_schema.json"
b1$readme_path
#> [1] "C:\\Users\\ZOBAER~1\\AppData\\Local\\Temp\\RtmpkfKgRn/README_FOR_LLM.txt"
b1$zip_path
#> [1] "C:\\Users\\ZOBAER~1\\AppData\\Local\\Temp\\RtmpkfKgRn/toothgrowth_fake.zip"