The hardware and bandwidth for this mirror are donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our WordPress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.
generate_fake_with_privacy()
creates a synthetic copy of
your data.
It then handles sensitive columns by name.
level | category_mode | column_mode | numeric_mode |
---|---|---|---|
low | preserve | keep | range |
medium | generic | generic | range |
high | generic | generic | distribution |
sensitive_detect
auto-finds common PII by column name.
sensitive_strategy
chooses how to treat those columns:
"fake"
(tokenize) or "drop"
(remove).
You can also list the sensitive columns yourself with sensitive = c("id","email", ...).
library(FakeDataR)
# Toy input used throughout: 50 rows with three identifier-like columns
# (id, email, phone), a categorical column (dept) and a numeric column (spend).
# dept and spend are drawn at random, so their values differ between runs.
df <- data.frame(
  id = seq_len(50),
  email = sprintf("u%02d@x.com", seq_len(50)),
  phone = sprintf("555-01%02d", seq_len(50)),
  dept = sample(c("A", "B", "C"), size = 50, replace = TRUE),
  spend = round(runif(n = 50, min = 10, max = 200), digits = 2),
  check.names = FALSE
)
# "low" privacy level with automatic detection of sensitive columns.
# Detected columns are replaced with fake values (the "fake" strategy,
# which is the default) rather than removed.
fake_low <- generate_fake_with_privacy(
  data = df,
  n = 60,
  level = "low",
  seed = 1,
  sensitive_detect = TRUE,
  sensitive_strategy = "fake",
  normalize = TRUE
)
# "medium" privacy level with automatic detection of sensitive columns.
# Detected columns are removed from the result (the "drop" strategy).
fake_drop <- generate_fake_with_privacy(
  data = df,
  n = 60,
  level = "medium",
  seed = 1,
  sensitive_detect = TRUE,
  sensitive_strategy = "drop",
  normalize = TRUE
)
# Column names of the "low" run: all five original names are still present
names(fake_low)
#> [1] "id" "email" "phone" "dept" "spend"
# Column names of the "drop" run: only two columns remain, and they carry
# generic names (the "medium" level uses column_mode = generic, see the table)
names(fake_drop)
#> [1] "var4" "var5"
# Inspect privacy metadata
# The columns that were treated as sensitive are recorded as an attribute
attr(fake_low, "sensitive_columns")
#> [1] "id" "email" "phone"
# With the "drop" strategy the removed columns are recorded instead
attr(fake_drop, "dropped_columns")
#> [1] "id" "email" "phone"
# name_map is a named character vector; here every name maps to itself
# (presumably original name -> output name; confirm in the package docs)
attr(fake_low, "name_map")
#> id email phone dept spend
#> "id" "email" "phone" "dept" "spend"
You can fully control what’s sensitive. Here we turn off auto-detect and list columns ourselves:
# Explicit control: auto-detection is switched off and the sensitive
# columns are named by hand; they are faked, not dropped.
fake_explicit <- generate_fake_with_privacy(
  data = df,
  n = 60,
  seed = 1,
  sensitive = c("id", "email", "phone"),
  sensitive_detect = FALSE,
  sensitive_strategy = "fake",
  normalize = TRUE
)
# All five columns are still present ...
names(fake_explicit)
#> [1] "id" "email" "phone" "dept" "spend"
# ... and the hand-picked columns are recorded as the sensitive ones
attr(fake_explicit, "sensitive_columns")
#> [1] "id" "email" "phone"
# Regular-expression fragments for column names that should count as
# sensitive. The list is deliberately broad; trim or extend it as needed.
sensitive_patterns <- c(
  # identifiers and personal names
  "^id$",
  "employee[_-]?id",
  "user(name|[_-]?id)?$",
  "full[_-]?name",
  "first[_-]?name",
  "last[_-]?name",
  # contact details
  "email|e-mail",
  "phone|tel|mobile",
  "fax",
  # postal address and geolocation
  "address|street|road|avenue|apt|unit|suite|zip|postal|postcode|city|state|province|country",
  "lat(itude)?|lon(gitude)?|gps",
  # national / tax identifiers (a sample from several countries)
  "RegId|ssn|sin|nin|aadhaar|aadhar|bvn|curp|dni|ced(ul|)+a|cpf|pan\\b|tin\\b|ein\\b|pesel|nin\\b",
  # licences and travel documents
  "passport|visa|license|licence|driver|dl\\b|vin|plate",
  # banking and payment cards
  "iban|swift|bic|routing|sort[_-]?code|account|acct|bank",
  "credit|debit|card|cvv|cvc|pan[_-]?number",
  # credentials, secrets and device identifiers
  "password|pass|pwd|pin|otp|secret|token|api[_-]?key|auth|bearer|session|cookie",
  "ip(_address)?|mac(_address)?|imei|imsi|serial|device|udid|android[_-]?id|idfa|gaid",
  # medical / patient records
  "mrn|nhs|medicare|medicaid|patient|diagnosis",
  # dates of birth
  "dob|date[_-]?of[_-]?birth|birth(day|date)",
  # student records
  "student[_-]?id"
)
# Join every fragment into a single alternation; the leading (?i) flag makes
# the match case-insensitive.
rx <- sprintf("(?i)(%s)", paste(sensitive_patterns, collapse = "|"))
# Keep the column names that match anywhere in the name
sens_cols <- names(df)[grep(rx, names(df))]
sens_cols
#> [1] "id" "email" "phone"
# Recompute the pattern matches from rx, then pass them on as the explicit
# sensitive set. "email" is always included, whether or not the pattern
# caught it; duplicates are removed.
sens_cols <- names(df)[grep(rx, names(df))]
fake_custom_detect <- generate_fake_with_privacy(
  data = df,
  n = 60,
  seed = 1,
  sensitive = union(sens_cols, "email"),
  sensitive_detect = FALSE,
  sensitive_strategy = "fake",
  normalize = TRUE
)
# The custom-detected columns are the ones recorded as sensitive
attr(fake_custom_detect, "sensitive_columns")
#> [1] "id" "email" "phone"
# Compare the original data with the synthetic copy from the "low" run.
# Per the printed output below, the report has one row per column, covering
# class, NA proportion, blank proportion and a range check.
v1 <- validate_fake(df, fake_low)
# Show the first five rows of the report (one per column of df here)
head(v1, 5)
#> column class_original class_fake class_match na_prop_original na_prop_fake
#> 1 id integer integer TRUE 0 0
#> 2 email character character TRUE 0 0
#> 3 phone character character TRUE 0 0
#> 4 dept character character TRUE 0 0
#> 5 spend numeric numeric TRUE 0 0
#> na_match blank_prop_original blank_prop_fake blank_match
#> 1 TRUE NA NA NA
#> 2 TRUE 0 0 TRUE
#> 3 TRUE 0 0 TRUE
#> 4 TRUE 0 0 TRUE
#> 5 TRUE NA NA NA
#> range_within_original
#> 1 FALSE
#> 2 NA
#> 3 NA
#> 4 NA
#> 5 TRUE
These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.