README

The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

Overview

opencesp provides tools to synthesize tabular datasets and to evaluate synthetic data from both utility and disclosure risk viewpoints. The emphasis is on the flexibility and accessibility of the methods proposed, hence most of these tools are nonparametric. The synthesis approaches that are targeted by the package are primarily those in which the synthetic distribution is specified in terms of conditional laws. This includes sequential as well as fully conditional specifications.

Installation

install.packages("opencesp")

Main functions

Example

The example below demonstrates how the package can be used to synthesize the iris dataset by sequential specification, and evaluate the risk-utility tradeoff provided by the synthesis.

library(opencesp)
set.seed(1234)

orig <- iris[sapply(iris, is.numeric)]

score_pgb <- function(x, y) {
  if(is.null(x)) {
    d <- density(y, bw = "nrd0")
    return(sum(log(approx(d$x, d$y, xout = y, rule = 2)$y)))
  }
  fit <- pgb_cvh(y ~ ., data = cbind.data.frame(y, x), subsample = 1, select_h = "none")
  cde <- predict_cde_pgb_raw(fit, x, y)
  sum(log(diag(cde)))
}

ord <- dep_order(orig, score_func = score_pgb)
vars <- names(orig)[ord]

tree_grid <- c(25, 150, 300, 600)

# Fit one complete factorization of the joint distribution for each tree budget.
fits_by_tree <- lapply(tree_grid, function(nt) {
  fits <- vector("list", length(vars))
  names(fits) <- vars
  
  fits[[1]] <- density(orig[[vars[1]]], bw = "nrd0")
  
  for(j in seq_along(vars)[-1]) {
    formula_j <- reformulate(vars[seq_len(j - 1)], response = vars[j])
    
    fits[[j]] <- pgb(
      formula_j,
      data = orig[, vars, drop = FALSE],
      pvalid = 0,
      ntrees = nt,
      subsample = 1
    )
  }
  
  fits
})
names(fits_by_tree) <- tree_grid

nsynth <- 10

synth_list <- lapply(fits_by_tree, function(fits) {
  replicate(nsynth, simplify = FALSE, expr = {
    syn <- orig[rep(1, nrow(orig)), vars, drop = FALSE]
    
    # Marginal draw from the reference-bandwidth kernel density estimate.
    syn[[1]] <- sample(orig[[vars[1]]], nrow(orig), replace = TRUE) + rnorm(nrow(orig), sd = fits[[1]]$bw)
    
    for(j in seq_along(vars)[-1]) {
      # Raw conditional draw: choose a quantile interval, then sample uniformly in it.
      qhat <- predict(fits[[j]], syn[, vars[seq_len(j - 1)], drop = FALSE])
      seg <- sample.int(ncol(qhat) - 1, nrow(syn), replace = TRUE, prob = diff(fits[[j]]$targets))
      
      lo <- qhat[cbind(seq_len(nrow(syn)), seg)]
      hi <- qhat[cbind(seq_len(nrow(syn)), seg + 1)]
      
      syn[[j]] <- ifelse(hi > lo, runif(nrow(syn), lo, hi), lo)
    }
    
    round_synth(orig[, vars, drop = FALSE], syn)[, names(orig), drop = FALSE]
  })
})

ru <- data.frame(
  ntrees = tree_grid,
  utility = sapply(synth_list, function(synth_l) {
    mean(sapply(synth_l, function(synth) 1 - 2 * abs(tsAUC(orig, synth)$auc-0.5)))
  }),
  risk = sapply(synth_list, function(synth_l) {
    mean(sapply(synth_l, function(synth) adaptive_matches_prop(orig, synth)))
  })
)

# Plot the risk-utility map for the ts-AUC and the adaptive proportion of matches
plot(ru$utility, ru$risk, pch = 19, xlab = "Utility", ylab = "Risk")

text(ru$utility, ru$risk, labels = ru$ntrees, pos = 2)
lines(ru$utility, ru$risk, lty = 2, col = "grey70")

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.