FakeDataR: Getting started

The hardware and bandwidth for this mirror are donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our WordPress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

This vignette shows how to mirror the structure of real data with fully synthetic values, verify the structure, and produce an LLM-ready bundle.

Quick start

# Load the package first so the calls below resolve when this chunk is run on
# its own.
library(FakeDataR)

# Seed the RNG so the randomly drawn Progress values are the same on every run.
set.seed(1)

# tiny input with a few likely sensitive fields
df <- data.frame(
  id = sprintf("id%03d", 1:10),
  email = paste0("a", 1:10, "@x.com"),
  Progress = paste0(sample(80:100, 10, replace = TRUE), "%"),
  check.names = FALSE
)

# Pre-process the raw input before generation.
# NOTE(review): the validation table reports Progress as numeric in `orig`,
# so this step presumably parses the "NN%" strings - confirm in the docs.
orig <- prepare_input_data(df)

# Generate 10 synthetic rows; `id` and `email` are named as sensitive and are
# replaced with fake values rather than dropped.
fake_priv <- generate_fake_with_privacy(
  data = orig, n = 10, level = "low", seed = 1,
  sensitive = c("id", "email"),
  sensitive_detect = TRUE,
  sensitive_strategy = "fake",
  normalize = TRUE
)

# quick validation sample
head(validate_fake(orig, fake_priv), 5)
#>     column class_original class_fake class_match na_prop_original na_prop_fake
#> 1       id      character    integer       FALSE                0            0
#> 2    email      character  character        TRUE                0            0
#> 3 Progress        numeric    numeric        TRUE                0            0
#>   na_match blank_prop_original blank_prop_fake blank_match
#> 1     TRUE                   0              NA          NA
#> 2     TRUE                   0               0        TRUE
#> 3     TRUE                  NA              NA          NA
#>   range_within_original
#> 1                    NA
#> 2                    NA
#> 3                  TRUE

library(FakeDataR)

# Basic fake from a data.frame: 200 synthetic rows modelled on the built-in
# mtcars data, generated with a fixed seed.
fake_mtc <- generate_fake_data(mtcars, n = 200, seed = 1)
# Compare original and synthetic data column by column; the printed table
# reports class, NA proportion, blank proportion and numeric range checks.
validate_fake(mtcars, fake_mtc)
#>    column class_original class_fake class_match na_prop_original na_prop_fake
#> 1     mpg        numeric    numeric        TRUE                0            0
#> 2     cyl        numeric    numeric        TRUE                0            0
#> 3    disp        numeric    numeric        TRUE                0            0
#> 4      hp        numeric    numeric        TRUE                0            0
#> 5    drat        numeric    numeric        TRUE                0            0
#> 6      wt        numeric    numeric        TRUE                0            0
#> 7    qsec        numeric    numeric        TRUE                0            0
#> 8      vs        numeric    numeric        TRUE                0            0
#> 9      am        numeric    numeric        TRUE                0            0
#> 10   gear        numeric    numeric        TRUE                0            0
#> 11   carb        numeric    numeric        TRUE                0            0
#>    na_match blank_prop_original blank_prop_fake blank_match
#> 1      TRUE                  NA              NA          NA
#> 2      TRUE                  NA              NA          NA
#> 3      TRUE                  NA              NA          NA
#> 4      TRUE                  NA              NA          NA
#> 5      TRUE                  NA              NA          NA
#> 6      TRUE                  NA              NA          NA
#> 7      TRUE                  NA              NA          NA
#> 8      TRUE                  NA              NA          NA
#> 9      TRUE                  NA              NA          NA
#> 10     TRUE                  NA              NA          NA
#> 11     TRUE                  NA              NA          NA
#>    range_within_original
#> 1                   TRUE
#> 2                   TRUE
#> 3                   TRUE
#> 4                   TRUE
#> 5                   TRUE
#> 6                   TRUE
#> 7                   TRUE
#> 8                   TRUE
#> 9                   TRUE
#> 10                  TRUE
#> 11                  TRUE

Factors, characters, and numerics

# CO2 is coerced to a plain data.frame before both generation and validation.
fake_co2 <- generate_fake_data(as.data.frame(CO2), n = 200, seed = 2)
# In the table below, Plant is "ordered/factor" in the original but "factor"
# in the fake data, which is why class_match is FALSE for that row.
validate_fake(as.data.frame(CO2), fake_co2)
#>      column class_original class_fake class_match na_prop_original na_prop_fake
#> 1     Plant ordered/factor     factor       FALSE                0            0
#> 2      Type         factor     factor        TRUE                0            0
#> 3 Treatment         factor     factor        TRUE                0            0
#> 4      conc        numeric    numeric        TRUE                0            0
#> 5    uptake        numeric    numeric        TRUE                0            0
#>   na_match blank_prop_original blank_prop_fake blank_match
#> 1     TRUE                   0               0        TRUE
#> 2     TRUE                   0               0        TRUE
#> 3     TRUE                   0               0        TRUE
#> 4     TRUE                  NA              NA          NA
#> 5     TRUE                  NA              NA          NA
#>   range_within_original
#> 1                    NA
#> 2                    NA
#> 3                    NA
#> 4                  TRUE
#> 5                  TRUE

# ToothGrowth mixes numeric columns (len, dose) with a factor (supp).
fake_tg <- generate_fake_data(ToothGrowth, n = 120, seed = 3)
# Range checks apply to the numeric columns only; the factor row shows NA there.
validate_fake(ToothGrowth, fake_tg)
#>   column class_original class_fake class_match na_prop_original na_prop_fake
#> 1    len        numeric    numeric        TRUE                0            0
#> 2   supp         factor     factor        TRUE                0            0
#> 3   dose        numeric    numeric        TRUE                0            0
#>   na_match blank_prop_original blank_prop_fake blank_match
#> 1     TRUE                  NA              NA          NA
#> 2     TRUE                   0               0        TRUE
#> 3     TRUE                  NA              NA          NA
#>   range_within_original
#> 1                  TRUE
#> 2                    NA
#> 3                  TRUE

Dates and POSIXct (time zones preserved)

# One Date column holding 50 consecutive days starting 2020-01-01.
df_date <- data.frame(d = seq(as.Date("2020-01-01"), by = "day", length.out = 50))
# Ask for 80 synthetic rows, i.e. more rows than the original has.
fake_date <- generate_fake_data(df_date, n = 80, seed = 4)
# The fake column is still of class Date (see the str() output below).
str(fake_date$d)
#>  Date[1:80], format: "2020-01-30" "2020-01-01" "2020-01-15" "2020-01-15" "2020-02-10" ...

# 200 hourly timestamps in the America/New_York zone, starting at midnight on
# 2023-05-01.
dt <- data.frame(
  when = seq(
    as.POSIXct("2023-05-01 00:00:00", tz = "America/New_York"),
    by = "hour",
    length.out = 200
  )
)
# 50 synthetic timestamps drawn with a fixed seed.
fake_dt <- generate_fake_data(dt, n = 50, seed = 5)
# Inspect the class of the fake column.
str(fake_dt[["when"]])
#>  POSIXct[1:50], format: "2023-05-02 15:50:33" "2023-05-06 16:21:30" "2023-05-08 14:27:29" ...
# Earliest and latest fake timestamps, printed with their time zone.
range(fake_dt[["when"]])
#> [1] "2023-05-01 02:52:55 EDT" "2023-05-09 00:13:36 EDT"

Public datasets - wrap in guards, trim sizes

Each chunk below runs only if the optional package it needs is installed; otherwise it prints a short "skipping" message instead.


# Optional demo: bail out with a message when nycflights13 is unavailable.
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  message("nycflights13 not installed - skipping.")
} else {
  fl <- nycflights13::flights
  # Work on a seeded random subset of 2000 rows to keep the chunk fast.
  set.seed(10)
  fl_small <- fl[sample.int(nrow(fl), size = 2000), ]
  fake_fl <- generate_fake_data(
    fl_small,
    n = 500,
    seed = 10,
    numeric_mode = "distribution"
  )
  # Show the validation rows for the first five columns only.
  head(validate_fake(fl_small, fake_fl), n = 5)
}
#>           column class_original class_fake class_match na_prop_original
#> 1           year        integer    integer        TRUE            0.000
#> 2          month        integer    integer        TRUE            0.000
#> 3            day        integer    integer        TRUE            0.000
#> 4       dep_time        integer    integer        TRUE            0.027
#> 5 sched_dep_time        integer    integer        TRUE            0.000
#>   na_prop_fake na_match blank_prop_original blank_prop_fake blank_match
#> 1         0.00     TRUE                  NA              NA          NA
#> 2         0.00     TRUE                  NA              NA          NA
#> 3         0.00     TRUE                  NA              NA          NA
#> 4         0.02     TRUE                  NA              NA          NA
#> 5         0.00     TRUE                  NA              NA          NA
#>   range_within_original
#> 1                  TRUE
#> 2                  TRUE
#> 3                  TRUE
#> 4                  TRUE
#> 5                  TRUE

# Optional demo: bail out with a message when palmerpenguins is unavailable.
if (!requireNamespace("palmerpenguins", quietly = TRUE)) {
  message("palmerpenguins not installed - skipping.")
} else {
  # Keep four columns and drop every row that has a missing value in them.
  peng <- na.omit(
    palmerpenguins::penguins[, c("species", "island", "bill_length_mm", "sex")]
  )
  fake_peng <- generate_fake_data(
    peng,
    n = 400,
    seed = 11,
    category_mode = "preserve"
  )
  head(validate_fake(peng, fake_peng), n = 5)
}
#>           column class_original class_fake class_match na_prop_original
#> 1        species         factor     factor        TRUE                0
#> 2         island         factor     factor        TRUE                0
#> 3 bill_length_mm        numeric    numeric        TRUE                0
#> 4            sex         factor     factor        TRUE                0
#>   na_prop_fake na_match blank_prop_original blank_prop_fake blank_match
#> 1            0     TRUE                   0               0        TRUE
#> 2            0     TRUE                   0               0        TRUE
#> 3            0     TRUE                  NA              NA          NA
#> 4            0     TRUE                   0               0        TRUE
#>   range_within_original
#> 1                    NA
#> 2                    NA
#> 3                  TRUE
#> 4                    NA

Gapminder demo

# Optional package; make the chunk robust
if (requireNamespace("gapminder", quietly = TRUE)) {
  set.seed(21)
  gm <- gapminder::gapminder
  # Keep it light if you want: gm <- gm[sample.int(nrow(gm), 1000), ]
  # (gapminder has 1704 rows, so the sample size must not exceed that unless
  # sampling with replacement.)

  fake_gm <- generate_fake_data(
    gm, n = 800, seed = 21,
    numeric_mode = "distribution",  # nicer numeric spread
    category_mode = "preserve"      # keep factor levels
  )

  # Last expression of the branch, so the full validation table is printed.
  validate_fake(gm, fake_gm)
} else {
  message("gapminder not installed; skipping demo.")
}
#>      column class_original class_fake class_match na_prop_original na_prop_fake
#> 1   country         factor     factor        TRUE                0            0
#> 2 continent         factor     factor        TRUE                0            0
#> 3      year        integer    integer        TRUE                0            0
#> 4   lifeExp        numeric    numeric        TRUE                0            0
#> 5       pop        integer    integer        TRUE                0            0
#> 6 gdpPercap        numeric    numeric        TRUE                0            0
#>   na_match blank_prop_original blank_prop_fake blank_match
#> 1     TRUE                   0               0        TRUE
#> 2     TRUE                   0               0        TRUE
#> 3     TRUE                  NA              NA          NA
#> 4     TRUE                  NA              NA          NA
#> 5     TRUE                  NA              NA          NA
#> 6     TRUE                  NA              NA          NA
#>   range_within_original
#> 1                    NA
#> 2                    NA
#> 3                  TRUE
#> 4                  TRUE
#> 5                  TRUE
#> 6                  TRUE

Sensitive columns: fake vs drop

# Seed the RNG so the random `spend` column is reproducible.
set.seed(12)
# Toy data with three identifier-like columns and one numeric measure.
df_pii <- data.frame(
  id    = 1:100,
  email = sprintf("user%03d@corp.com", 1:100),
  phone = sprintf("(415) 555-%04d", 1:100),
  spend = runif(100, 10, 500)
)

# An explicit `seed` is passed to each call, as everywhere else in this
# vignette, so either call gives the same result when run on its own.

# Strategy "fake": detected sensitive columns are kept but filled with
# synthetic values.
fake_keep <- generate_fake_data(
  df_pii, n = 120, seed = 12,
  sensitive_detect   = TRUE,
  sensitive_strategy = "fake"
)
# Strategy "drop": detected sensitive columns are removed from the output.
fake_drop <- generate_fake_data(
  df_pii, n = 120, seed = 12,
  sensitive_detect   = TRUE,
  sensitive_strategy = "drop"
)

names(fake_keep)        # expect id/email/phone present but synthetic
#> [1] "id"    "email" "phone" "spend"
names(fake_drop)        # expect only "spend"
#> [1] "spend"

LLM bundle: data + schema + README (+ optional ZIP)

# Build a bundle from ToothGrowth: 150 synthetic rows, written as CSV and RDS
# into the session temp directory under the base name "toothgrowth_fake".
# NOTE(review): `write_prompt = TRUE` presumably produces the README file and
# `zip = TRUE` the archive whose paths are printed below - confirm in the docs.
b1 <- llm_bundle(
  data = ToothGrowth, n = 150, level = "high", seed = 10,
  formats = c("csv","rds"),
  path = tempdir(), filename = "toothgrowth_fake",
  write_prompt = TRUE, zip = TRUE
)
# The returned object exposes the locations of the generated files.
# The printed paths are machine-specific (here: a Windows temp directory).
b1$schema_path
#> [1] "C:\\Users\\ZOBAER~1\\AppData\\Local\\Temp\\RtmpkfKgRn/toothgrowth_fake_schema.json"
b1$readme_path
#> [1] "C:\\Users\\ZOBAER~1\\AppData\\Local\\Temp\\RtmpkfKgRn/README_FOR_LLM.txt"
b1$zip_path
#> [1] "C:\\Users\\ZOBAER~1\\AppData\\Local\\Temp\\RtmpkfKgRn/toothgrowth_fake.zip"

Parquet export (optional)

# Parquet export is only attempted when the arrow package is available.
if (!requireNamespace("arrow", quietly = TRUE)) {
  message("arrow not installed - skipping Parquet export.")
} else {
  fake_air <- generate_fake_data(airquality, n = 400, seed = 20)
  # Write the synthetic data to a Parquet file in the session temp directory.
  export_fake(
    fake_air,
    file.path(tempdir(), "air.parquet")
  )
}

Reproducibility

# Two independent runs with the same input, row count and seed ...
a1 <- generate_fake_data(CO2, n = 123, seed = 42)
a2 <- generate_fake_data(CO2, n = 123, seed = 42)
# ... yield exactly the same object (identical() returns TRUE below).
identical(a1, a2)
#> [1] TRUE

# Seed the RNG so the benchmark input itself is reproducible; only the
# measured wall-clock time varies between runs and machines.
set.seed(99)

# 200,000-row input with one numeric, one character and one Date column.
big <- data.frame(
  a = runif(2e5),
  b = sample(letters, 2e5, replace = TRUE),
  c = as.Date("2020-01-01") + sample.int(3000, 2e5, replace = TRUE)
)
# Time the generation of an equally large synthetic data set.
system.time({
  fake_big <- generate_fake_data(big, n = 2e5, seed = 99)
})

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.