ACIC2016

The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

ACIC2016


# function to simulate data
simulate_data <- function(truth, covariates, n, random.seed, sigma_y = 1) {

    # set seed
    random.seed <- list(seed = random.seed, kind = "Mersenne-Twister", normal.kind = "Inversion", sample.kind = "Rounding")
    suppressWarnings(do.call("set.seed", random.seed))
    
    # sample data n = 100, 500, 2500
    selected_sample <- sample(seq_along(truth$mu.0), n, replace = FALSE)
    mu.0 = truth$mu.0[selected_sample]
    mu.1 = truth$mu.1[selected_sample]
    x = covariates[selected_sample,]
    
    # assign treatment
    n1=round(n/2)
    n0=n-n1
    zind=sample(1:n,size=n1)
    z=numeric(n)
    z[zind]=1
    mu = z * mu.1 + (1-z) * mu.0
    y <- mu + sigma_y*rnorm(n)

    return(
        list(
            x = x,
            z = z,
            y = y,
            mu.1 = mu.1,
            mu.0 = mu.0
        )
    )
}

# function to get true GATE
trueGATE <- function(truth, tau, ngates = 5) {
  n = length(tau)
  fd_label = ntile(tau, ngates)
  gates = numeric(ngates)
  for (i in 1:ngates) {
    gates[i] = mean(truth$mu.1[fd_label == i] - truth$mu.0[fd_label == i])
  }
  return(gates)
}

# function to get true GATE via cross-fitting
compute_true_GATE <- function(truth, input_x, full_data, n_sample, nfolds, ngates, iter) {
  n_train = round(n_sample*(nfolds-1)/nfolds)
  train_data = simulate_data(truth, input_x, n_train, iter)

  cf_train = causal_forest(
    model.matrix(~.-1, data = train_data$x),
    train_data$y,
    train_data$z,
    num.trees = 10,
    num.threads = 1
  )

  tau_test = predict(cf_train, newdata = model.matrix(~.-1, data = full_data$x))$predictions
  gate_test = trueGATE(full_data, tau_test, ngates)

  return(tibble(
    iter = iter,
    group = 1:ngates,
    true_gate = gate_test  # FIX: Rename here to match your join logic
  ))
}

make_synth_acic_population <- function(n_pop = 5000, seed = 1,
                                      p_cont = 10, p_bin = 10,
                                      sigma_mu = 1) {
  set.seed(seed)

  # correlated continuous covariates
  rho <- 0.4
  Sigma <- rho ^ abs(outer(seq_len(p_cont), seq_len(p_cont), "-"))
  Xc <- matrix(rnorm(n_pop * p_cont), n_pop, p_cont) %*% chol(Sigma)
  colnames(Xc) <- paste0("x", seq_len(p_cont))

  # binary covariates correlated with continuous ones
  Xb <- matrix(0, n_pop, p_bin)
  colnames(Xb) <- paste0("b", seq_len(p_bin))
  for (j in seq_len(p_bin)) {
    lin <- 0.6 * Xc[, ((j - 1) %% p_cont) + 1] - 0.3 * Xc[, ((j) %% p_cont) + 1] + rnorm(n_pop, sd = 0.5)
    pr <- plogis(lin)
    Xb[, j] <- rbinom(n_pop, 1, pr)
  }

  # a couple categorical covariates (factors)
  cut3 <- cut(Xc[, 3], breaks = quantile(Xc[, 3], probs = c(0, 1/3, 2/3, 1)),
              include.lowest = TRUE, labels = c("A", "B", "C"))
  cut4 <- cut(Xc[, 4], breaks = quantile(Xc[, 4], probs = c(0, 0.25, 0.5, 0.75, 1)),
              include.lowest = TRUE, labels = c("L1", "L2", "L3", "L4"))

  x <- data.frame(
    Xc,
    Xb,
    cat1 = factor(cut3),
    cat2 = factor(cut4)
  )

  # define mu0(x) and heterogeneous tau(x) with nonlinearities and interactions
  mu0 <- 0.5 * Xc[, 1] -
    0.25 * (Xc[, 2]^2) +
    sin(Xc[, 3]) +
    0.6 * Xb[, 1] +
    0.4 * (cut3 == "B") -
    0.3 * (cut3 == "C") +
    0.5 * Xc[, 5] * Xb[, 2] +
    0.3 * pmax(Xc[, 6], 0)

  tau <- 0.2 +
    0.35 * Xc[, 1] +
    0.25 * (Xc[, 7] > 0) +
    0.2 * Xb[, 3] -
    0.25 * Xc[, 2] * Xb[, 1] +
    0.3 * (cut4 %in% c("L3", "L4")) +
    0.15 * cos(Xc[, 8])

  mu0 <- sigma_mu * mu0
  tau <- sigma_mu * tau
  mu1 <- mu0 + tau

  truth <- list(
    mu.0 = as.numeric(mu0),
    mu.1 = as.numeric(mu1)
  )

  list(x = x, truth = truth)
}

# function to compute GATE bounds with michael's original code
compute_GATE_bounds <- function(truth, input_x, n_sample, nfolds, ngates, n_iter) {

  # get training data
  full_data = simulate_data(truth, input_x, n_sample, n_iter)

  # create folds
  folds = caret::createFolds(full_data$y, k = nfolds)
  indcv = numeric(n_sample)
  tauCV = matrix(0, nrow = n_sample, ncol = nfolds)

  for (i in 1:nfolds) {
    x_full = model.matrix(~.-1, data = full_data$x)
    x_train = x_full[-folds[[i]], ]
    y_train = full_data$y[-folds[[i]]]
    z_train = full_data$z[-folds[[i]]]

    x_test = x_full[folds[[i]], ]
    y_test = full_data$y[folds[[i]]]
    z_test = full_data$z[folds[[i]]]

    # fit causal forest
    cf_train = causal_forest(
      x_train,
      y_train,
      z_train,
      num.trees = 20,
      num.threads = 1
    )

    # compute tau
    tauCV[, i] = predict(cf_train, x_full)$predictions 
    indcv[folds[[i]]] = rep(i, nrow(x_test))
  }
  
  # compute GATE
  output = GATEcv(full_data$z, tauCV, full_data$y, indcv, ngates = ngates)

  
  return(
    list(
      group = 1:ngates,
      gate = output$gate,
      sd = output$sd,
      upper = output$gate + 1.96 * output$sd,
      lower = output$gate - 1.96 * output$sd
    )
  )
}

# function to compute GATE bounds with evalHTE package
compute_GATE_bounds_pkg <- function(truth, input_x, n_sample, nfolds, ngates, n_iter, algs) {
  # get training data
  full_data = simulate_data(truth, input_x, n_sample, n_iter)

  # create data
  df = data.frame(
    full_data$x,
    y = full_data$y,
    z = full_data$z
  )

  # specify formula
  cov_names = input_x %>% names()
  user_formula <- paste0("y ~ z*(", paste(cov_names, collapse = " + "), ")")

  # run with evalITR package
  fit <- estimate_hte(
    treatment = "z",
    form = user_formula,
    data = df,
    algorithms = algs,
    n_folds = nfolds,
    meta_learner = "slearner")

  # evaluate HTE
  est <- evaluate_hte(fit)
  gate_pkg <- summary(est)$GATE %>% 
      mutate(iter = n_iter)

  return(gate_pkg)
  
}

# function to loop over n_sim for true GATE
run_simluation_gate <- function(truth, input_x, full_data, n_sample, nfolds, ngates, n_sim) {

  # loop over n_sim
  results = future_map(1:n_sim, ~compute_true_GATE(truth, input_x, full_data, n_sample, nfolds, ngates, .x),
    .options = furrr_options(seed = 123)
  ) %>% bind_rows()

  return(results)
}


# function to loop over n_sim for gate sd
run_simluation_gate_sd <- function(truth, input_x, n_sample, nfolds, ngates, n_sim, pkg = FALSE, algs = c("causal_forest")) {

    # if pkg == FALSE, compute GATE bounds using michael's original code
    if (pkg == FALSE) {
        results = future_map(1:n_sim, ~compute_GATE_bounds(truth, input_x, n_sample, nfolds, ngates, .x),
        .options = furrr_options(seed = 123)
  ) %>% bind_rows()

    } else {
        # if pkg == TRUE, compute GATE bounds using evalITR package
        results = future_map(1:n_sim, ~compute_GATE_bounds_pkg(truth, input_x, n_sample, nfolds, ngates, .x, algs = algs),
        .options = furrr_options(seed = 123)
  ) %>% bind_rows()

    }

  return(results)
}


Sys.setenv(
OMP_NUM_THREADS = 1,
OPENBLAS_NUM_THREADS = 1,
MKL_NUM_THREADS = 1,
VECLIB_MAXIMUM_THREADS = 1
)
library(evalHTE)
#> Loading required package: dplyr
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library(future)
future::plan(future::sequential)
library(furrr)
library(magrittr)
# load ACIC 2016 data
library(grf)      # causal_forest
options(grf.num.threads = 1)
library(dplyr)    # ntile, left_join, mutate, summarise
library(tibble)
synth <- make_synth_acic_population(n_pop = 5000, seed = 28, p_cont = 10, p_bin = 10)

input_x <- synth$x
truth <- synth$truth

full_data <- simulate_data(truth, input_x, nrow(input_x), 2021)

# parameters
n_sample = 50
n_sim = 5
n_folds = 2
n_gates = 2

# compute true GATE
results_gate = run_simluation_gate(truth, input_x, full_data, n_sample, n_folds, n_gates, n_sim)

# compute GATE bounds
results_sd_pkg = run_simluation_gate_sd(truth, input_x, n_sample, n_folds, n_gates, n_sim, pkg = TRUE, algs = c("causal_forest"))
#> Evaluate ITR with cross-validation ...
#> Evaluate ITR with cross-validation ...
#> Evaluate ITR with cross-validation ...
#> Evaluate ITR with cross-validation ...
#> Evaluate ITR with cross-validation ...

# check coverage
results_sd_pkg %>%
  left_join(results_gate, by = c("iter", "group")) %>%
  mutate(
    upper = estimate + 1.96 * std.deviation,
    lower = estimate - 1.96 * std.deviation,
    coverage = ((upper > true_gate) & (lower < true_gate)) * 1
  ) %>%
  summarise(
    coverage = mean(coverage, na.rm = TRUE)
  )
#> # A tibble: 1 × 1
#>   coverage
#>      <dbl>
#> 1      0.9

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.