Privacy and validation

What the function does (Overview)

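generate_fake_with_privacy() creates a synthetic copy of your data.
It then handles sensitive columns, which are either detected from their column names or listed by you explicitly, by faking or dropping them according to the chosen strategy.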

Level presets

| level  | category_mode | column_mode | numeric_mode |
|--------|---------------|-------------|--------------|
| low    | preserve      | keep        | range        |
| medium | generic       | generic     | range        |
| high   | generic       | generic     | distribution |

Levels and strategies

library(FakeDataR)

# Example input: an identifier, two contact fields, one categorical column and
# one numeric column. The RNG is seeded first so that the randomly drawn
# `dept` and `spend` values are the same on every run.
set.seed(1)
df <- data.frame(
  id    = 1:50,
  email = sprintf("u%02d@x.com", 1:50),
  phone = sprintf("555-01%02d", 1:50),
  dept  = sample(c("A", "B", "C"), 50, replace = TRUE),
  spend = round(runif(50, 10, 200), 2),
  check.names = FALSE
)


# Level "low": sensitive columns are auto-detected and replaced with fake
# values (sensitive_strategy = "fake", the default strategy).
fake_low <- generate_fake_with_privacy(
  data               = df,
  n                  = 60,
  level              = "low",
  seed               = 1,
  sensitive_detect   = TRUE,
  sensitive_strategy = "fake",
  normalize          = TRUE
)

# Level "medium": sensitive columns are auto-detected and removed from the
# result (sensitive_strategy = "drop").
fake_drop <- generate_fake_with_privacy(
  data               = df,
  n                  = 60,
  level              = "medium",
  seed               = 1,
  sensitive_detect   = TRUE,
  sensitive_strategy = "drop",
  normalize          = TRUE
)

# Compare the column names of the two results: "fake" keeps every column,
# while "drop" at level "medium" returns only the two non-sensitive columns,
# under generic names.
names(fake_low)
#> [1] "id"    "email" "phone" "dept"  "spend"
names(fake_drop)
#> [1] "var4" "var5"

# The privacy metadata travels with each result as attributes.
attr(fake_low,  "sensitive_columns")
#> [1] "id"    "email" "phone"
attr(fake_drop, "dropped_columns")
#> [1] "id"    "email" "phone"
attr(fake_low,  "name_map")
#>      id   email   phone    dept   spend 
#>    "id" "email" "phone"  "dept" "spend"

Explicit ‘sensitive’ vs auto-detect

You can fully control what’s sensitive. Here we turn off auto-detect and list columns ourselves:

# Auto-detection is switched off; only the columns named in `sensitive` are
# treated as sensitive and faked.
fake_explicit <- generate_fake_with_privacy(
  data               = df,
  n                  = 60,
  seed               = 1,
  sensitive          = c("id", "email", "phone"),
  sensitive_detect   = FALSE,
  sensitive_strategy = "fake",
  normalize          = TRUE
)

# All five columns are kept; the three listed ones are recorded as sensitive.
names(fake_explicit)
#> [1] "id"    "email" "phone" "dept"  "spend"
attr(fake_explicit, "sensitive_columns")
#> [1] "id"    "email" "phone"

Extending detection with your own patterns


# A broad, configurable pattern set.
# Each entry is a regular-expression fragment that is matched against column
# names. Most fragments are unanchored substrings, so they can also flag
# unrelated names (for example "ip" matches "ship_date"); review the detected
# columns before relying on them.
sensitive_patterns <- c(
  # direct IDs / names
  "^id$", "employee[_-]?id", "user(name|[_-]?id)?$",
  "full[_-]?name", "first[_-]?name", "last[_-]?name",
  # contact
  "email|e-mail", "phone|tel|mobile", "fax",
  # address / geo
  "address|street|road|avenue|apt|unit|suite|zip|postal|postcode|city|state|province|country",
  "lat(itude)?|lon(gitude)?|gps",
  # government IDs (international sampling)
  # "ced(ul)?a" covers both "ceda" and "cedula"; "nin" is listed only once.
  "RegId|ssn|sin|nin|aadhaar|aadhar|bvn|curp|dni|ced(ul)?a|cpf|pan\\b|tin\\b|ein\\b|pesel",
  # licenses / travel docs
  "passport|visa|license|licence|driver|dl\\b|vin|plate",
  # finance / payments
  "iban|swift|bic|routing|sort[_-]?code|account|acct|bank",
  "credit|debit|card|cvv|cvc|pan[_-]?number",
  # auth / secrets / device
  "password|pass|pwd|pin|otp|secret|token|api[_-]?key|auth|bearer|session|cookie",
  "ip(_address)?|mac(_address)?|imei|imsi|serial|device|udid|android[_-]?id|idfa|gaid",
  # medical / patient
  "mrn|nhs|medicare|medicaid|patient|diagnosis",
  # birthdays
  "dob|date[_-]?of[_-]?birth|birth(day|date)",
  # education
  "student[_-]?id"
)

# Combine all fragments into one alternation. The leading (?i) is a Perl-style
# inline option that makes the whole pattern case-insensitive, so the match
# below is run with perl = TRUE.
rx <- paste0("(?i)(", paste(sensitive_patterns, collapse = "|"), ")")
sens_cols <- names(df)[grepl(rx, names(df), perl = TRUE)]
sens_cols
#> [1] "id"    "email" "phone"

# Column names matched by the custom pattern `rx`.
sens_cols <- grep(rx, names(df), value = TRUE)

# Add "email" by hand; unique() keeps each column name only once.
custom_sensitive <- unique(c(sens_cols, "email"))

# Auto-detection is off, so only the columns collected above are faked.
fake_custom_detect <- generate_fake_with_privacy(
  data               = df,
  n                  = 60,
  seed               = 1,
  sensitive          = custom_sensitive,
  sensitive_detect   = FALSE,
  sensitive_strategy = "fake",
  normalize          = TRUE
)
attr(fake_custom_detect, "sensitive_columns")
#> [1] "id"    "email" "phone"

Validation

# Compare the original data with the low-privacy fake. The printed result has
# one row per column, reporting class, NA proportion, blank proportion and
# range checks (see the output below).
v1 <- validate_fake(df, fake_low)
head(v1, n = 5L)
#>   column class_original class_fake class_match na_prop_original na_prop_fake
#> 1     id        integer    integer        TRUE                0            0
#> 2  email      character  character        TRUE                0            0
#> 3  phone      character  character        TRUE                0            0
#> 4   dept      character  character        TRUE                0            0
#> 5  spend        numeric    numeric        TRUE                0            0
#>   na_match blank_prop_original blank_prop_fake blank_match
#> 1     TRUE                  NA              NA          NA
#> 2     TRUE                   0               0        TRUE
#> 3     TRUE                   0               0        TRUE
#> 4     TRUE                   0               0        TRUE
#> 5     TRUE                  NA              NA          NA
#>   range_within_original
#> 1                 FALSE
#> 2                    NA
#> 3                    NA
#> 4                    NA
#> 5                  TRUE