The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.
# function to simulate data
simulate_data <- function(truth, covariates, n, random.seed, sigma_y = 1) {
# set seed
random.seed <- list(seed = random.seed, kind = "Mersenne-Twister", normal.kind = "Inversion", sample.kind = "Rounding")
suppressWarnings(do.call("set.seed", random.seed))
# sample data n = 100, 500, 2500
selected_sample <- sample(seq_along(truth$mu.0), n, replace = FALSE)
mu.0 = truth$mu.0[selected_sample]
mu.1 = truth$mu.1[selected_sample]
x = covariates[selected_sample,]
# assign treatment
n1=round(n/2)
n0=n-n1
zind=sample(1:n,size=n1)
z=numeric(n)
z[zind]=1
mu = z * mu.1 + (1-z) * mu.0
y <- mu + sigma_y*rnorm(n)
return(
list(
x = x,
z = z,
y = y,
mu.1 = mu.1,
mu.0 = mu.0
)
)
}
# function to get true GATE
trueGATE <- function(truth, tau, ngates = 5) {
n = length(tau)
fd_label = ntile(tau, ngates)
gates = numeric(ngates)
for (i in 1:ngates) {
gates[i] = mean(truth$mu.1[fd_label == i] - truth$mu.0[fd_label == i])
}
return(gates)
}
# function to get true GATE via cross-fitting
compute_true_GATE <- function(truth, input_x, full_data, n_sample, nfolds, ngates, iter) {
n_train = round(n_sample*(nfolds-1)/nfolds)
train_data = simulate_data(truth, input_x, n_train, iter)
cf_train = causal_forest(
model.matrix(~.-1, data = train_data$x),
train_data$y,
train_data$z,
num.trees = 10,
num.threads = 1
)
tau_test = predict(cf_train, newdata = model.matrix(~.-1, data = full_data$x))$predictions
gate_test = trueGATE(full_data, tau_test, ngates)
return(tibble(
iter = iter,
group = 1:ngates,
true_gate = gate_test # FIX: Rename here to match your join logic
))
}
make_synth_acic_population <- function(n_pop = 5000, seed = 1,
p_cont = 10, p_bin = 10,
sigma_mu = 1) {
set.seed(seed)
# correlated continuous covariates
rho <- 0.4
Sigma <- rho ^ abs(outer(seq_len(p_cont), seq_len(p_cont), "-"))
Xc <- matrix(rnorm(n_pop * p_cont), n_pop, p_cont) %*% chol(Sigma)
colnames(Xc) <- paste0("x", seq_len(p_cont))
# binary covariates correlated with continuous ones
Xb <- matrix(0, n_pop, p_bin)
colnames(Xb) <- paste0("b", seq_len(p_bin))
for (j in seq_len(p_bin)) {
lin <- 0.6 * Xc[, ((j - 1) %% p_cont) + 1] - 0.3 * Xc[, ((j) %% p_cont) + 1] + rnorm(n_pop, sd = 0.5)
pr <- plogis(lin)
Xb[, j] <- rbinom(n_pop, 1, pr)
}
# a couple categorical covariates (factors)
cut3 <- cut(Xc[, 3], breaks = quantile(Xc[, 3], probs = c(0, 1/3, 2/3, 1)),
include.lowest = TRUE, labels = c("A", "B", "C"))
cut4 <- cut(Xc[, 4], breaks = quantile(Xc[, 4], probs = c(0, 0.25, 0.5, 0.75, 1)),
include.lowest = TRUE, labels = c("L1", "L2", "L3", "L4"))
x <- data.frame(
Xc,
Xb,
cat1 = factor(cut3),
cat2 = factor(cut4)
)
# define mu0(x) and heterogeneous tau(x) with nonlinearities and interactions
mu0 <- 0.5 * Xc[, 1] -
0.25 * (Xc[, 2]^2) +
sin(Xc[, 3]) +
0.6 * Xb[, 1] +
0.4 * (cut3 == "B") -
0.3 * (cut3 == "C") +
0.5 * Xc[, 5] * Xb[, 2] +
0.3 * pmax(Xc[, 6], 0)
tau <- 0.2 +
0.35 * Xc[, 1] +
0.25 * (Xc[, 7] > 0) +
0.2 * Xb[, 3] -
0.25 * Xc[, 2] * Xb[, 1] +
0.3 * (cut4 %in% c("L3", "L4")) +
0.15 * cos(Xc[, 8])
mu0 <- sigma_mu * mu0
tau <- sigma_mu * tau
mu1 <- mu0 + tau
truth <- list(
mu.0 = as.numeric(mu0),
mu.1 = as.numeric(mu1)
)
list(x = x, truth = truth)
}
# function to compute GATE bounds with michael's original code
compute_GATE_bounds <- function(truth, input_x, n_sample, nfolds, ngates, n_iter) {
# get training data
full_data = simulate_data(truth, input_x, n_sample, n_iter)
# create folds
folds = caret::createFolds(full_data$y, k = nfolds)
indcv = numeric(n_sample)
tauCV = matrix(0, nrow = n_sample, ncol = nfolds)
for (i in 1:nfolds) {
x_full = model.matrix(~.-1, data = full_data$x)
x_train = x_full[-folds[[i]], ]
y_train = full_data$y[-folds[[i]]]
z_train = full_data$z[-folds[[i]]]
x_test = x_full[folds[[i]], ]
y_test = full_data$y[folds[[i]]]
z_test = full_data$z[folds[[i]]]
# fit causal forest
cf_train = causal_forest(
x_train,
y_train,
z_train,
num.trees = 20,
num.threads = 1
)
# compute tau
tauCV[, i] = predict(cf_train, x_full)$predictions
indcv[folds[[i]]] = rep(i, nrow(x_test))
}
# compute GATE
output = GATEcv(full_data$z, tauCV, full_data$y, indcv, ngates = ngates)
return(
list(
group = 1:ngates,
gate = output$gate,
sd = output$sd,
upper = output$gate + 1.96 * output$sd,
lower = output$gate - 1.96 * output$sd
)
)
}
# function to compute GATE bounds with evalHTE package
compute_GATE_bounds_pkg <- function(truth, input_x, n_sample, nfolds, ngates, n_iter, algs) {
# get training data
full_data = simulate_data(truth, input_x, n_sample, n_iter)
# create data
df = data.frame(
full_data$x,
y = full_data$y,
z = full_data$z
)
# specify formula
cov_names = input_x %>% names()
user_formula <- paste0("y ~ z*(", paste(cov_names, collapse = " + "), ")")
# run with evalITR package
fit <- estimate_hte(
treatment = "z",
form = user_formula,
data = df,
algorithms = algs,
n_folds = nfolds,
meta_learner = "slearner")
# evaluate HTE
est <- evaluate_hte(fit)
gate_pkg <- summary(est)$GATE %>%
mutate(iter = n_iter)
return(gate_pkg)
}
# function to loop over n_sim for true GATE
run_simluation_gate <- function(truth, input_x, full_data, n_sample, nfolds, ngates, n_sim) {
# loop over n_sim
results = future_map(1:n_sim, ~compute_true_GATE(truth, input_x, full_data, n_sample, nfolds, ngates, .x),
.options = furrr_options(seed = 123)
) %>% bind_rows()
return(results)
}
# function to loop over n_sim for gate sd
run_simluation_gate_sd <- function(truth, input_x, n_sample, nfolds, ngates, n_sim, pkg = FALSE, algs = c("causal_forest")) {
# if pkg == FALSE, compute GATE bounds using michael's original code
if (pkg == FALSE) {
results = future_map(1:n_sim, ~compute_GATE_bounds(truth, input_x, n_sample, nfolds, ngates, .x),
.options = furrr_options(seed = 123)
) %>% bind_rows()
} else {
# if pkg == TRUE, compute GATE bounds using evalITR package
results = future_map(1:n_sim, ~compute_GATE_bounds_pkg(truth, input_x, n_sample, nfolds, ngates, .x, algs = algs),
.options = furrr_options(seed = 123)
) %>% bind_rows()
}
return(results)
}
Sys.setenv(
OMP_NUM_THREADS = 1,
OPENBLAS_NUM_THREADS = 1,
MKL_NUM_THREADS = 1,
VECLIB_MAXIMUM_THREADS = 1
)
library(evalHTE)
#> Loading required package: dplyr
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(future)
future::plan(future::sequential)
library(furrr)
library(magrittr)
# load ACIC 2016 data
library(grf) # causal_forest
options(grf.num.threads = 1)
library(dplyr) # ntile, left_join, mutate, summarise
library(tibble)
synth <- make_synth_acic_population(n_pop = 5000, seed = 28, p_cont = 10, p_bin = 10)
input_x <- synth$x
truth <- synth$truth
full_data <- simulate_data(truth, input_x, nrow(input_x), 2021)
# parameters
n_sample = 50
n_sim = 5
n_folds = 2
n_gates = 2
# compute true GATE
results_gate = run_simluation_gate(truth, input_x, full_data, n_sample, n_folds, n_gates, n_sim)
# compute GATE bounds
results_sd_pkg = run_simluation_gate_sd(truth, input_x, n_sample, n_folds, n_gates, n_sim, pkg = TRUE, algs = c("causal_forest"))
#> Evaluate ITR with cross-validation ...
#> Evaluate ITR with cross-validation ...
#> Evaluate ITR with cross-validation ...
#> Evaluate ITR with cross-validation ...
#> Evaluate ITR with cross-validation ...
# check coverage
results_sd_pkg %>%
left_join(results_gate, by = c("iter", "group")) %>%
mutate(
upper = estimate + 1.96 * std.deviation,
lower = estimate - 1.96 * std.deviation,
coverage = ((upper > true_gate) & (lower < true_gate)) * 1
) %>%
summarise(
coverage = mean(coverage, na.rm = TRUE)
)
#> # A tibble: 1 × 1
#> coverage
#> <dbl>
#> 1 0.9These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.