Privacy and validation

The hardware and bandwidth for this mirror are donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our WordPress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

What the function does (Overview)

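generate_fake_with_privacy() creates a synthetic copy of your data.
It then identifies sensitive columns by their names and either replaces their values with fake tokens or drops those columns, depending on the strategy you choose.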

Level presets

level	category_mode	column_mode	numeric_mode
low	preserve	keep	range
medium	generic	generic	range
high	generic	generic	distribution

sensitive_detect auto-finds common PII by column name.
sensitive_strategy chooses how to treat those columns: "fake" (tokenize) or "drop" (remove).
You can also list sensitive columns yourself with sensitive = c("id","email", ...).

Levels and strategies

library(FakeDataR)

# Build a small example data set: three PII-like columns (id, email, phone)
# plus one categorical column (dept) and one numeric column (spend).
# Seed the RNG first so that `dept` and `spend` are identical on every run;
# without it the example data (and anything derived from it) changes each time.
set.seed(123)

df <- data.frame(
  id    = 1:50,
  email = sprintf("u%02d@x.com", 1:50),
  phone = sprintf("555-01%02d", 1:50),
  dept  = sample(c("A", "B", "C"), size = 50, replace = TRUE),
  spend = round(runif(50, min = 10, max = 200), digits = 2),
  check.names = FALSE
)


# Low privacy level: sensitive columns are auto-detected by name and their
# values are replaced with fake tokens (strategy "fake", the default).
fake_low <- generate_fake_with_privacy(
  data               = df,
  n                  = 60,
  level              = "low",
  seed               = 1,
  sensitive_detect   = TRUE,
  sensitive_strategy = "fake",
  normalize          = TRUE
)

# Medium privacy level: sensitive columns are auto-detected by name and
# removed from the result altogether (strategy "drop").
fake_drop <- generate_fake_with_privacy(
  data               = df,
  n                  = 60,
  level              = "medium",
  seed               = 1,
  sensitive_detect   = TRUE,
  sensitive_strategy = "drop",
  normalize          = TRUE
)

# Compare the column names of the two results. With level = "low" and the
# "fake" strategy all five original names are kept; with level = "medium"
# and the "drop" strategy only two columns remain, under generic names.
names(fake_low)
#> [1] "id"    "email" "phone" "dept"  "spend"
names(fake_drop)
#> [1] "var4" "var5"

# Inspect privacy metadata stored as attributes on the returned objects:
# the columns treated as sensitive, the columns that were dropped, and a
# name map (here every name maps to itself).
# NOTE(review): name_map presumably maps original -> output names; confirm
# against the package documentation.
attr(fake_low,  "sensitive_columns")
#> [1] "id"    "email" "phone"
attr(fake_drop, "dropped_columns")
#> [1] "id"    "email" "phone"
attr(fake_low,  "name_map")
#>      id   email   phone    dept   spend 
#>    "id" "email" "phone"  "dept" "spend"

Explicit ‘sensitive’ vs auto-detect

You can fully control what’s sensitive. Here we turn off auto-detect and list columns ourselves:

# Name the sensitive columns explicitly and switch name-based
# auto-detection off; the listed columns are faked rather than dropped.
# `level` is not supplied here, so the function's own default applies.
fake_explicit <- generate_fake_with_privacy(
  data               = df,
  n                  = 60,
  seed               = 1,
  sensitive          = c("id", "email", "phone"),
  sensitive_detect   = FALSE,
  sensitive_strategy = "fake",
  normalize          = TRUE
)

# Column names of the result, then the columns recorded as sensitive.
names(fake_explicit)
#> [1] "id"    "email" "phone" "dept"  "spend"
attr(fake_explicit, "sensitive_columns")
#> [1] "id"    "email" "phone"

Extending detection with your own patterns


# A broad, configurable set of regular-expression fragments that flag column
# names likely to hold PII. Fragments are unanchored unless they say
# otherwise, so they match anywhere inside a column name; extend or trim the
# list to suit your data.
sensitive_patterns <- c(
  # direct IDs / names
  "^id$", "employee[_-]?id", "user(name|[_-]?id)?$",
  "full[_-]?name", "first[_-]?name", "last[_-]?name",
  # contact
  "email|e-mail", "phone|tel|mobile", "fax",
  # address / geo
  "address|street|road|avenue|apt|unit|suite|zip|postal|postcode|city|state|province|country",
  "lat(itude)?|lon(gitude)?|gps",
  # government IDs (international sampling)
  # "ced(ul)?a" matches both "cedula" and "ceda"; the unanchored "nin"
  # already covers every name a trailing "nin\\b" would match.
  "RegId|ssn|sin|nin|aadhaar|aadhar|bvn|curp|dni|ced(ul)?a|cpf|pan\\b|tin\\b|ein\\b|pesel",
  # licenses / travel docs
  "passport|visa|license|licence|driver|dl\\b|vin|plate",
  # finance / payments
  "iban|swift|bic|routing|sort[_-]?code|account|acct|bank",
  "credit|debit|card|cvv|cvc|pan[_-]?number",
  # auth / secrets / device
  "password|pass|pwd|pin|otp|secret|token|api[_-]?key|auth|bearer|session|cookie",
  "ip(_address)?|mac(_address)?|imei|imsi|serial|device|udid|android[_-]?id|idfa|gaid",
  # medical / patient
  "mrn|nhs|medicare|medicaid|patient|diagnosis",
  # birthdays
  "dob|date[_-]?of[_-]?birth|birth(day|date)",
  # education
  "student[_-]?id"
)

# Join all fragments into a single alternation. The leading "(?i)" makes the
# whole expression case-insensitive, so "Email" and "EMAIL" are flagged too.
rx <- paste0("(?i)(", paste(sensitive_patterns, collapse = "|"), ")")

# Keep the column names that match at least one fragment.
sens_cols <- names(df)[grepl(rx, names(df))]
sens_cols
#> [1] "id"    "email" "phone"

# Re-derive the pattern-matched column names so this chunk stands on its own.
sens_cols <- grep(rx, names(df), value = TRUE)

# Pass the custom-detected columns (plus "email", listed by hand) as the
# explicit sensitive set; built-in auto-detection stays off.
fake_custom_detect <- generate_fake_with_privacy(
  data               = df,
  n                  = 60,
  seed               = 1,
  sensitive          = union(sens_cols, "email"),
  sensitive_detect   = FALSE,
  sensitive_strategy = "fake",
  normalize          = TRUE
)
attr(fake_custom_detect, "sensitive_columns")
#> [1] "id"    "email" "phone"

Validation

# Compare the original data with the low-privacy synthetic copy and show the
# first five rows of the report (one row per column of `df`).
# The report lists, per column, the class in each data set, the NA and blank
# proportions, and match flags for each of those.
# NOTE(review): range_within_original is FALSE for `id`, presumably because
# that column was replaced with fake values; NA appears for the character
# columns. Confirm the exact semantics in the validate_fake() documentation.
v1 <- validate_fake(df, fake_low)
head(v1, 5)
#>   column class_original class_fake class_match na_prop_original na_prop_fake
#> 1     id        integer    integer        TRUE                0            0
#> 2  email      character  character        TRUE                0            0
#> 3  phone      character  character        TRUE                0            0
#> 4   dept      character  character        TRUE                0            0
#> 5  spend        numeric    numeric        TRUE                0            0
#>   na_match blank_prop_original blank_prop_fake blank_match
#> 1     TRUE                  NA              NA          NA
#> 2     TRUE                   0               0        TRUE
#> 3     TRUE                   0               0        TRUE
#> 4     TRUE                   0               0        TRUE
#> 5     TRUE                  NA              NA          NA
#>   range_within_original
#> 1                 FALSE
#> 2                    NA
#> 3                    NA
#> 4                    NA
#> 5                  TRUE

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.