The hardware and bandwidth for this mirror are donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our WordPress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.
This vignette shows how to mirror the structure of real data with fully synthetic values, verify the structure, and produce an LLM-ready bundle.
# Attach the package up front: the helpers below are called before the
# vignette's later `library(FakeDataR)` line, so without this the chunk
# fails when the script is run top to bottom.
library(FakeDataR)

# Tiny input with a few likely-sensitive fields (`id`, `email`).
# Seed the RNG first so the randomly drawn `Progress` percentages are the
# same on every run.
set.seed(1)
df <- data.frame(
  id = sprintf("id%03d", 1:10),
  email = paste0("a", 1:10, "@x.com"),
  Progress = paste0(sample(80:100, 10, replace = TRUE), "%"),
  check.names = FALSE
)

# Pre-process the raw data frame before synthesis.
# NOTE(review): the validation output reports `Progress` as numeric, so this
# step presumably parses the "NN%" strings -- confirm against the package docs.
orig <- prepare_input_data(df)

# Generate 10 synthetic rows. `id` and `email` are flagged as sensitive
# explicitly, automatic detection is switched on as well, and the "fake"
# strategy is requested for sensitive columns.
fake_priv <- generate_fake_with_privacy(
  data = orig, n = 10, level = "low", seed = 1,
  sensitive = c("id", "email"),
  sensitive_detect = TRUE,
  sensitive_strategy = "fake",
  normalize = TRUE
)

# Quick validation sample: first rows of the original-vs-fake comparison.
head(validate_fake(orig, fake_priv), 5)
#> column class_original class_fake class_match na_prop_original na_prop_fake
#> 1 id character integer FALSE 0 0
#> 2 email character character TRUE 0 0
#> 3 Progress numeric numeric TRUE 0 0
#> na_match blank_prop_original blank_prop_fake blank_match
#> 1 TRUE 0 NA NA
#> 2 TRUE 0 0 TRUE
#> 3 TRUE NA NA NA
#> range_within_original
#> 1 NA
#> 2 NA
#> 3 TRUE
library(FakeDataR)

# Basic usage: build 200 synthetic rows from the built-in `mtcars` data
# frame, then compare the fake against the original column by column.
fake_mtc <- generate_fake_data(
  mtcars,
  n = 200,
  seed = 1
)
validate_fake(mtcars, fake_mtc)
#> column class_original class_fake class_match na_prop_original na_prop_fake
#> 1 mpg numeric numeric TRUE 0 0
#> 2 cyl numeric numeric TRUE 0 0
#> 3 disp numeric numeric TRUE 0 0
#> 4 hp numeric numeric TRUE 0 0
#> 5 drat numeric numeric TRUE 0 0
#> 6 wt numeric numeric TRUE 0 0
#> 7 qsec numeric numeric TRUE 0 0
#> 8 vs numeric numeric TRUE 0 0
#> 9 am numeric numeric TRUE 0 0
#> 10 gear numeric numeric TRUE 0 0
#> 11 carb numeric numeric TRUE 0 0
#> na_match blank_prop_original blank_prop_fake blank_match
#> 1 TRUE NA NA NA
#> 2 TRUE NA NA NA
#> 3 TRUE NA NA NA
#> 4 TRUE NA NA NA
#> 5 TRUE NA NA NA
#> 6 TRUE NA NA NA
#> 7 TRUE NA NA NA
#> 8 TRUE NA NA NA
#> 9 TRUE NA NA NA
#> 10 TRUE NA NA NA
#> 11 TRUE NA NA NA
#> range_within_original
#> 1 TRUE
#> 2 TRUE
#> 3 TRUE
#> 4 TRUE
#> 5 TRUE
#> 6 TRUE
#> 7 TRUE
#> 8 TRUE
#> 9 TRUE
#> 10 TRUE
#> 11 TRUE
# Factor-heavy example: `CO2` is coerced to a data frame before both
# generation and validation so the two calls see the same input.
fake_co2 <- generate_fake_data(
  as.data.frame(CO2),
  n = 200,
  seed = 2
)
validate_fake(as.data.frame(CO2), fake_co2)
#> column class_original class_fake class_match na_prop_original na_prop_fake
#> 1 Plant ordered/factor factor FALSE 0 0
#> 2 Type factor factor TRUE 0 0
#> 3 Treatment factor factor TRUE 0 0
#> 4 conc numeric numeric TRUE 0 0
#> 5 uptake numeric numeric TRUE 0 0
#> na_match blank_prop_original blank_prop_fake blank_match
#> 1 TRUE 0 0 TRUE
#> 2 TRUE 0 0 TRUE
#> 3 TRUE 0 0 TRUE
#> 4 TRUE NA NA NA
#> 5 TRUE NA NA NA
#> range_within_original
#> 1 NA
#> 2 NA
#> 3 NA
#> 4 TRUE
#> 5 TRUE
# Mixed numeric/factor example: 120 synthetic rows from `ToothGrowth`.
fake_tg <- generate_fake_data(
  ToothGrowth,
  n = 120,
  seed = 3
)
validate_fake(ToothGrowth, fake_tg)
#> column class_original class_fake class_match na_prop_original na_prop_fake
#> 1 len numeric numeric TRUE 0 0
#> 2 supp factor factor TRUE 0 0
#> 3 dose numeric numeric TRUE 0 0
#> na_match blank_prop_original blank_prop_fake blank_match
#> 1 TRUE NA NA NA
#> 2 TRUE 0 0 TRUE
#> 3 TRUE NA NA NA
#> range_within_original
#> 1 TRUE
#> 2 NA
#> 3 TRUE
# Date column example: 50 consecutive days starting 2020-01-01.
df_date <- data.frame(
  d = seq(as.Date("2020-01-01"), by = "day", length.out = 50)
)
fake_date <- generate_fake_data(
  df_date,
  n = 80,
  seed = 4
)
# Inspect the class and first values of the synthetic date column.
str(fake_date$d)
#> Date[1:80], format: "2020-01-30" "2020-01-01" "2020-01-15" "2020-01-15" "2020-02-10" ...
# Date-time column example: 200 hourly timestamps in America/New_York.
# Use the `seq()` generic rather than calling the `seq.POSIXt()` method
# directly; dispatch on the POSIXct start value gives the same sequence.
dt <- data.frame(
  when = seq(
    as.POSIXct("2023-05-01 00:00:00", tz = "America/New_York"),
    by = "hour",
    length.out = 200
  )
)
fake_dt <- generate_fake_data(dt, n = 50, seed = 5)
# Inspect the class and first values of the synthetic timestamp column.
str(fake_dt$when)
#> POSIXct[1:50], format: "2023-05-02 15:50:33" "2023-05-06 16:21:30" "2023-05-08 14:27:29" ...
# Earliest and latest synthetic timestamps.
range(fake_dt$when)
#> [1] "2023-05-01 02:52:55 EDT" "2023-05-09 00:13:36 EDT"
These chunks run only if the packages are installed.
# Optional demo: only runs when the nycflights13 package is available.
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  message("nycflights13 not installed - skipping.")
} else {
  fl <- nycflights13::flights
  # Draw a reproducible 2000-row subset so the example stays small.
  set.seed(10)
  fl_small <- fl[sample.int(nrow(fl), 2000), ]
  fake_fl <- generate_fake_data(
    fl_small,
    n = 500,
    seed = 10,
    numeric_mode = "distribution"
  )
  # Show only the first five columns' validation rows.
  head(validate_fake(fl_small, fake_fl), 5)
}
#> column class_original class_fake class_match na_prop_original
#> 1 year integer integer TRUE 0.000
#> 2 month integer integer TRUE 0.000
#> 3 day integer integer TRUE 0.000
#> 4 dep_time integer integer TRUE 0.027
#> 5 sched_dep_time integer integer TRUE 0.000
#> na_prop_fake na_match blank_prop_original blank_prop_fake blank_match
#> 1 0.00 TRUE NA NA NA
#> 2 0.00 TRUE NA NA NA
#> 3 0.00 TRUE NA NA NA
#> 4 0.02 TRUE NA NA NA
#> 5 0.00 TRUE NA NA NA
#> range_within_original
#> 1 TRUE
#> 2 TRUE
#> 3 TRUE
#> 4 TRUE
#> 5 TRUE
# Optional demo: only runs when the palmerpenguins package is available.
if (!requireNamespace("palmerpenguins", quietly = TRUE)) {
  message("palmerpenguins not installed - skipping.")
} else {
  # Keep four columns and drop any row with a missing value.
  peng <- na.omit(
    palmerpenguins::penguins[, c("species", "island", "bill_length_mm", "sex")]
  )
  fake_peng <- generate_fake_data(
    peng,
    n = 400,
    seed = 11,
    category_mode = "preserve"
  )
  head(validate_fake(peng, fake_peng), 5)
}
#> column class_original class_fake class_match na_prop_original
#> 1 species factor factor TRUE 0
#> 2 island factor factor TRUE 0
#> 3 bill_length_mm numeric numeric TRUE 0
#> 4 sex factor factor TRUE 0
#> na_prop_fake na_match blank_prop_original blank_prop_fake blank_match
#> 1 0 TRUE 0 0 TRUE
#> 2 0 TRUE 0 0 TRUE
#> 3 0 TRUE NA NA NA
#> 4 0 TRUE 0 0 TRUE
#> range_within_original
#> 1 NA
#> 2 NA
#> 3 TRUE
#> 4 NA
# gapminder is an optional package, so guard the whole demo on it.
if (!requireNamespace("gapminder", quietly = TRUE)) {
  message("gapminder not installed; skipping demo.")
} else {
  set.seed(21)
  gm <- gapminder::gapminder
  # To lighten the example, subsample first:
  #   gm <- gm[sample.int(nrow(gm), 2000), ]
  fake_gm <- generate_fake_data(
    gm,
    n = 800,
    seed = 21,
    numeric_mode = "distribution", # nicer numeric spread
    category_mode = "preserve"     # keep factor levels
  )
  validate_fake(gm, fake_gm)
}
#> column class_original class_fake class_match na_prop_original na_prop_fake
#> 1 country factor factor TRUE 0 0
#> 2 continent factor factor TRUE 0 0
#> 3 year integer integer TRUE 0 0
#> 4 lifeExp numeric numeric TRUE 0 0
#> 5 pop integer integer TRUE 0 0
#> 6 gdpPercap numeric numeric TRUE 0 0
#> na_match blank_prop_original blank_prop_fake blank_match
#> 1 TRUE 0 0 TRUE
#> 2 TRUE 0 0 TRUE
#> 3 TRUE NA NA NA
#> 4 TRUE NA NA NA
#> 5 TRUE NA NA NA
#> 6 TRUE NA NA NA
#> range_within_original
#> 1 NA
#> 2 NA
#> 3 TRUE
#> 4 TRUE
#> 5 TRUE
#> 6 TRUE
# Toy data set with obviously sensitive columns (id, email, phone) plus one
# ordinary numeric column. The seed fixes the random `spend` values.
set.seed(12)
df_pii <- data.frame(
  id    = seq_len(100),
  email = sprintf("user%03d@corp.com", seq_len(100)),
  phone = sprintf("(415) 555-%04d", seq_len(100)),
  spend = runif(100, min = 10, max = 500)
)

# Same input, two strategies for the detected sensitive columns.
fake_keep <- generate_fake_data(
  df_pii,
  n = 120,
  sensitive_detect = TRUE,
  sensitive_strategy = "fake"
)
fake_drop <- generate_fake_data(
  df_pii,
  n = 120,
  sensitive_detect = TRUE,
  sensitive_strategy = "drop"
)

# "fake" strategy: id/email/phone should still be present, but synthetic.
names(fake_keep)
#> [1] "id" "email" "phone" "spend"
# "drop" strategy: only "spend" should remain.
names(fake_drop)
#> [1] "spend"
# Build an LLM-ready bundle from `ToothGrowth`, written to the session's
# temporary directory: CSV and RDS outputs, with the prompt file and zip
# archive requested.
b1 <- llm_bundle(
  data = ToothGrowth,
  n = 150,
  level = "high",
  seed = 10,
  formats = c("csv", "rds"),
  path = tempdir(),
  filename = "toothgrowth_fake",
  write_prompt = TRUE,
  zip = TRUE
)

# Paths of the generated artefacts, as reported by the returned object.
b1$schema_path
#> [1] "C:\\Users\\ZOBAER~1\\AppData\\Local\\Temp\\RtmpkfKgRn/toothgrowth_fake_schema.json"
b1$readme_path
#> [1] "C:\\Users\\ZOBAER~1\\AppData\\Local\\Temp\\RtmpkfKgRn/README_FOR_LLM.txt"
b1$zip_path
#> [1] "C:\\Users\\ZOBAER~1\\AppData\\Local\\Temp\\RtmpkfKgRn/toothgrowth_fake.zip"
These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.