đŸ“˜ Website • Articles • Reference
FakeDataR makes safe, synthetic stand‑ins for real
datasets.
It mirrors types, factor levels,
NA/blank rates, and even handles sensitive
columns (IDs, emails, phones) via fake or
drop strategies. You can also generate fake tables from a
database schema without reading any rows.
# install.packages("devtools")
::install_github("zobaer09/FakeDataR") devtools
Generate a fake dataset that matches the structure of a real one:
library(FakeDataR)
<- generate_fake_data(mtcars, n = 200, seed = 1)
fake_mtc head(fake_mtc)
#> mpg cyl disp hp drat wt qsec vs
#> 1 16.63945 5.070033 335.2440 282.43325 4.623352 3.588993 17.57882 0.2396067
#> 2 19.14491 4.874581 145.2945 314.84395 2.834732 4.191491 20.72770 0.6477649
#> 3 23.86205 6.067187 453.7102 93.73714 4.867064 3.012021 22.34145 0.9756708
#> 4 31.74288 5.075802 431.0475 264.19953 4.376889 5.247958 20.15496 0.3779988
#> 5 15.13953 4.724673 449.4281 328.11103 3.352964 1.975893 20.39140 0.4641441
#> 6 31.51216 6.074305 361.2276 327.86627 4.229320 1.665920 21.62005 0.8122963
#> am gear carb
#> 1 0.13853856 3.123219 7.102635
#> 2 0.04752457 3.710643 7.770379
#> 3 0.03391887 4.154076 7.068414
#> 4 0.91608902 4.070063 4.064007
#> 5 0.84020039 4.208546 2.343565
#> 6 0.17887142 3.972298 1.576061
Validate that classes and basic patterns were preserved:
validate_fake(mtcars, fake_mtc)
#> column class_original class_fake class_match na_prop_original na_prop_fake
#> 1 mpg numeric numeric TRUE 0 0
#> 2 cyl numeric numeric TRUE 0 0
#> 3 disp numeric numeric TRUE 0 0
#> 4 hp numeric numeric TRUE 0 0
#> 5 drat numeric numeric TRUE 0 0
#> 6 wt numeric numeric TRUE 0 0
#> 7 qsec numeric numeric TRUE 0 0
#> 8 vs numeric numeric TRUE 0 0
#> 9 am numeric numeric TRUE 0 0
#> 10 gear numeric numeric TRUE 0 0
#> 11 carb numeric numeric TRUE 0 0
#> na_match blank_prop_original blank_prop_fake blank_match
#> 1 TRUE NA NA NA
#> 2 TRUE NA NA NA
#> 3 TRUE NA NA NA
#> 4 TRUE NA NA NA
#> 5 TRUE NA NA NA
#> 6 TRUE NA NA NA
#> 7 TRUE NA NA NA
#> 8 TRUE NA NA NA
#> 9 TRUE NA NA NA
#> 10 TRUE NA NA NA
#> 11 TRUE NA NA NA
#> range_within_original
#> 1 TRUE
#> 2 TRUE
#> 3 TRUE
#> 4 TRUE
#> 5 TRUE
#> 6 TRUE
#> 7 TRUE
#> 8 TRUE
#> 9 TRUE
#> 10 TRUE
#> 11 TRUE
Preserve factor levels & numeric ranges (example with
palmerpenguins
):
if (requireNamespace("palmerpenguins", quietly = TRUE)) {
<- na.omit(palmerpenguins::penguins[, c("species","island","bill_length_mm","sex")])
peng <- generate_fake_data(peng, n = 400, seed = 11, category_mode = "preserve")
fake_peng validate_fake(peng, fake_peng)
}#> column class_original class_fake class_match na_prop_original
#> 1 species factor factor TRUE 0
#> 2 island factor factor TRUE 0
#> 3 bill_length_mm numeric numeric TRUE 0
#> 4 sex factor factor TRUE 0
#> na_prop_fake na_match blank_prop_original blank_prop_fake blank_match
#> 1 0 TRUE 0 0 TRUE
#> 2 0 TRUE 0 0 TRUE
#> 3 0 TRUE NA NA NA
#> 4 0 TRUE 0 0 TRUE
#> range_within_original
#> 1 NA
#> 2 NA
#> 3 TRUE
#> 4 NA
Detect and handle PII by name. Use strategy "fake"
(default) or "drop"
:
<- data.frame(
df id = 1:50,
email = sprintf("user%03d@corp.com", 1:50),
phone = sprintf("(415) 555-%04d", 1:50),
spend = runif(50, 10, 500)
)
# Keep the columns but replace values with synthetic ones
<- generate_fake_data(
fake_keep n = 80, seed = 12,
df, sensitive_detect = TRUE, sensitive_strategy = "fake"
)intersect(df$email, fake_keep$email) # should be character(0)
#> character(0)
# Drop sensitive columns entirely
<- generate_fake_data(
fake_drop n = 80, seed = 13,
df, sensitive_detect = TRUE, sensitive_strategy = "drop"
)names(fake_drop) # no id/email/phone
#> [1] "spend"
Create fake rows using only the schema (types, nullability, etc.). This works even when you cannot access the underlying data.
library(DBI); library(RSQLite)
<- dbConnect(RSQLite::SQLite(), ":memory:")
con dbExecute(con, "
CREATE TABLE employees (
id INTEGER,
email TEXT,
phone TEXT,
is_active BOOLEAN,
hired_at TIMESTAMP,
salary NUMERIC,
dept TEXT
)
")
<- schema_from_db(con, "employees")
sch <- generate_fake_from_schema(sch, n = 50, seed = 14)
fake
str(fake$hired_at) # POSIXct
dbDisconnect(con)
Write fake data to common formats (CSV, RDS, Parquet) and optionally produce an LLM bundle containing a schema and README.
# CSV / RDS
export_fake(fake_mtc, file.path(tempdir(), "fake_mtc.csv"))
export_fake(fake_mtc, file.path(tempdir(), "fake_mtc.rds"))
# Parquet (requires the 'arrow' package)
# install.packages("arrow")
export_fake(fake_mtc, file.path(tempdir(), "fake_mtc.parquet"))
# End-to-end bundle
<- llm_bundle(
b n = 200, seed = 10, level = "high",
mtcars, formats = c("csv","rds"),
path = tempdir(), filename = "mtcars_fake",
write_prompt = TRUE, zip = TRUE
)$zip_path b
All generators respect seed
. Given the same inputs and a
fixed package version, results are reproducible:
<- generate_fake_data(CO2, n = 123, seed = 42)
a1 <- generate_fake_data(CO2, n = 123, seed = 42)
a2 identical(a1, a2)
#> [1] TRUE
Issues and pull requests are welcome! If you find a bug or have a feature request, please open an issue on GitHub.
This package is distributed under the terms of the license included
in the repository (see LICENSE
).