The hardware and bandwidth for this mirror are donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our WordPress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

Privacy and validation

What the function does (Overview)

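generate_fake_with_privacy() creates a synthetic copy of your data.
It then identifies sensitive columns by name, either automatically (sensitive_detect = TRUE) or from a list you supply (sensitive), and fakes or drops them according to sensitive_strategy.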

Level presets

| level  | category_mode | column_mode | numeric_mode |
|--------|---------------|-------------|--------------|
| low    | preserve      | keep        | range        |
| medium | generic       | generic     | range        |
| high   | generic       | generic     | distribution |

Levels and strategies

library(FakeDataR)

# Example input: 50 rows with an integer key, two contact-style character
# columns, a three-level category and a numeric amount.
# Fix the RNG seed first so `dept` and `spend` are identical on every run;
# without it the example data (and anything derived from it) is not
# reproducible.
set.seed(123)
df <- data.frame(
  id    = 1:50,
  email = sprintf("u%02d@x.com", 1:50),
  phone = sprintf("555-01%02d", 1:50),
  dept  = sample(c("A", "B", "C"), 50, replace = TRUE),
  spend = round(runif(50, 10, 200), 2),
  check.names = FALSE
)


# "low" privacy level with automatic detection of sensitive columns.
# Detected columns are replaced with fake values ("fake" is the default
# strategy).
fake_low <- generate_fake_with_privacy(
  data               = df,
  n                  = 60,
  level              = "low",
  seed               = 1,
  sensitive_detect   = TRUE,
  sensitive_strategy = "fake",
  normalize          = TRUE
)

# "medium" privacy level with automatic detection of sensitive columns.
# Detected columns are removed from the result ("drop" strategy) instead of
# being faked.
fake_drop <- generate_fake_with_privacy(
  data               = df,
  n                  = 60,
  level              = "medium",
  seed               = 1,
  sensitive_detect   = TRUE,
  sensitive_strategy = "drop",
  normalize          = TRUE
)

# Compare the column names of the two results. With level = "low" the
# original names are kept.
names(fake_low)
#> [1] "id"    "email" "phone" "dept"  "spend"
# With level = "medium" and the "drop" strategy only two columns survive and
# they carry generic names.
# NOTE(review): presumably "var4"/"var5" are the generic names for the 4th and
# 5th original columns (dept, spend), per column_mode = "generic" in the
# presets table -- confirm against the package documentation.
names(fake_drop)
#> [1] "var4" "var5"

# Inspect privacy metadata
# The result carries attributes recording which columns were treated as
# sensitive, which were dropped, and a name map for the columns.
attr(fake_low,  "sensitive_columns")
#> [1] "id"    "email" "phone"
attr(fake_drop, "dropped_columns")
#> [1] "id"    "email" "phone"
attr(fake_low,  "name_map")
#>      id   email   phone    dept   spend 
#>    "id" "email" "phone"  "dept" "spend"

Explicit ‘sensitive’ vs auto-detect

You can fully control what’s sensitive. Here we turn off auto-detect and list columns ourselves:

# Name the sensitive columns explicitly and switch auto-detection off, so
# only the listed columns are treated as sensitive and faked.
explicit_sensitive <- c("id", "email", "phone")
fake_explicit <- generate_fake_with_privacy(
  data               = df,
  n                  = 60,
  seed               = 1,
  sensitive          = explicit_sensitive,
  sensitive_detect   = FALSE,
  sensitive_strategy = "fake",
  normalize          = TRUE
)
# Column names are unchanged; the metadata lists exactly the columns supplied.
names(fake_explicit)
#> [1] "id"    "email" "phone" "dept"  "spend"
attr(fake_explicit, "sensitive_columns")
#> [1] "id"    "email" "phone"

Extending detection with your own patterns


# A broad, configurable pattern set.
# Each element is a regular-expression fragment; the fragments are meant to be
# joined with "|" into one alternation and matched case-insensitively against
# column names.
# Most fragments are unanchored, so they match anywhere inside a name (for
# example "sin" also matches "business"). Tighten with ^, $ or \\b if you see
# false positives in your own data.
sensitive_patterns <- c(
  # direct IDs / names
  "^id$", "employee[_-]?id", "user(name|[_-]?id)?$",
  "full[_-]?name", "first[_-]?name", "last[_-]?name",
  # contact
  "email|e-mail", "phone|tel|mobile", "fax",
  # address / geo
  "address|street|road|avenue|apt|unit|suite|zip|postal|postcode",
  "city|state|province|country",
  "lat(itude)?|lon(gitude)?|gps",
  # government IDs (international sampling)
  # "ced(ul)*a" covers "cedula" and "ceda"; it replaces the equivalent but
  # harder to read "ced(ul|)+a". A trailing "nin\\b" was dropped because the
  # plain "nin" alternative already matches everything it could.
  "RegId|ssn|sin|nin|aadhaar|aadhar|bvn|curp|dni",
  "ced(ul)*a|cpf|pan\\b|tin\\b|ein\\b|pesel",
  # licenses / travel docs
  "passport|visa|license|licence|driver|dl\\b|vin|plate",
  # finance / payments
  "iban|swift|bic|routing|sort[_-]?code|account|acct|bank",
  "credit|debit|card|cvv|cvc|pan[_-]?number",
  # auth / secrets / device
  "password|pass|pwd|pin|otp|secret|token|api[_-]?key|auth|bearer",
  "session|cookie",
  "ip(_address)?|mac(_address)?|imei|imsi|serial|device|udid",
  "android[_-]?id|idfa|gaid",
  # medical / patient
  "mrn|nhs|medicare|medicaid|patient|diagnosis",
  # birthdays
  "dob|date[_-]?of[_-]?birth|birth(day|date)",
  # education
  "student[_-]?id"
)

# Join all fragments into a single case-insensitive alternation.
# The inline "(?i)" flag is Perl regex syntax, so the PCRE engine is requested
# explicitly with perl = TRUE rather than relying on the default engine
# accepting it.
rx <- paste0("(?i)(", paste(sensitive_patterns, collapse = "|"), ")")
sens_cols <- grep(rx, names(df), perl = TRUE, value = TRUE)
sens_cols
#> [1] "id"    "email" "phone"

# Hand the detected columns to the generator with auto-detection switched
# off. unique() lets you append extra columns by hand ("email" here) without
# creating duplicates when the patterns already caught them.
fake_custom_detect <- generate_fake_with_privacy(
  data = df, n = 60, seed = 1,
  sensitive = unique(c(sens_cols, "email")),
  sensitive_detect = FALSE,
  sensitive_strategy = "fake",
  normalize = TRUE
)
attr(fake_custom_detect, "sensitive_columns")
#> [1] "id"    "email" "phone"

Validation

# Compare the original data with the low-privacy fake copy.
# The printed result has one row per column, with the class, NA proportion
# and blank proportion of each side plus a match flag for each, and a
# range_within_original flag.
# NOTE(review): blank_* and range_within_original are NA for some columns,
# presumably because the check does not apply to that column type -- confirm
# against the validate_fake() documentation.
v1 <- validate_fake(df, fake_low)
head(v1, 5)
#>   column class_original class_fake class_match na_prop_original na_prop_fake
#> 1     id        integer    integer        TRUE                0            0
#> 2  email      character  character        TRUE                0            0
#> 3  phone      character  character        TRUE                0            0
#> 4   dept      character  character        TRUE                0            0
#> 5  spend        numeric    numeric        TRUE                0            0
#>   na_match blank_prop_original blank_prop_fake blank_match
#> 1     TRUE                  NA              NA          NA
#> 2     TRUE                   0               0        TRUE
#> 3     TRUE                   0               0        TRUE
#> 4     TRUE                   0               0        TRUE
#> 5     TRUE                  NA              NA          NA
#>   range_within_original
#> 1                 FALSE
#> 2                    NA
#> 3                    NA
#> 4                    NA
#> 5                  TRUE

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.