The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.
geex
library(geex)
library(inferference)
library(dplyr)
TODO: describe what’s going on here
<- 1000
n <- data_frame(
x A = rbinom(n, 1, .2),
Y0 = rnorm(n, 0, 1),
Y1 = rnorm(n, 2 * A, 1),
Y = (A*Y1) + (1 - A)*Y0)
<- function(data){
ipw_estFUN <- data$A
A <- data$Y
Y function(theta, phat){
<- 1/theta[1]
ipw0 <- 1/theta[2]
ipw1
# Estimating functions #
c( (1 - A) - theta[1],
- theta[2],
A
# Estimating IP weight
*(1 - A)*ipw0 - theta[3],
Y*(A)*ipw1 - theta[4],
Y
# Treating IP weight as known
*A/phat - theta[5]
Y
)
} }
<- mean(x$A)
phat <- m_estimate(ipw_estFUN,
out data = x,
inner_args = list(phat = phat),
root_control = setup_root_control(start = c(.5, .5, 0, 0, 0)))
## Comparing point estimates
all.equal(mean(x$Y * x$A/phat), coef(out)[4])
## [1] TRUE
all.equal(phat, coef(out)[2])
## [1] TRUE
## Comparing variance estimates
<- diag(vcov(out)) * n
geex_vcov
# estimates match treating propensity as known
all.equal(var(x$Y * x$A/phat) * (n - 1)/n, geex_vcov[5])
## [1] TRUE
# estimates match using influence function approach
<- x$Y * x$A/phat - mean(x$Y * x$A/phat)
y <- (x$A - phat) / (phat*(1 - phat))
z var(y - predict(lm(y ~ z))) - geex_vcov[4] # close
## [1] 0.005655136
An example \(\psi\) function written
in R
. This function computes the score functions for a GLM,
plus two counterfactual means estimated by inverse probability
weighting.
<- function(data, model, alpha){
eefun <- model.matrix(model, data = data)
X <- model.response(model.frame(model, data = data))
A <- data$Y
Y
function(theta){
<- length(theta)
p <- length(coef(model))
p1 <- X %*% theta[1:p1]
lp <- plogis(lp)
rho
<- ((rho/alpha)^A * ((1-rho)/(1-alpha))^(1 - A))
hh <- 1/(exp(sum(log(hh))))
IPW
<- apply(X, 2, function(x) sum((A - rho) * x))
score_eqns <- mean(Y * (A == 0)) * IPW / (1 - alpha)
ce0 <- mean(Y * (A == 1)) * IPW / (alpha)
ce1
c(score_eqns,
- theta[p - 1],
ce0 - theta[p])
ce1
} }
Compare to what inferference
gets.
<- interference(
test formula = Y | A ~ X1 | group,
data = vaccinesim,
model_method = 'glm',
allocations = c(.35, .4))
<- glm(A ~ X1, data = vaccinesim, family = binomial)
mglm
<- m_estimate(
ce_estimates estFUN = eefun,
data = vaccinesim,
units = 'group',
root_control = setup_root_control(start = c(coef(mglm), .4, .13)),
outer_args = list(alpha = .35, model = mglm)
)
roots(ce_estimates)
## (Intercept) X1
## -0.36869683 -0.02037916 0.42186669 0.15507946
# Compare parameter estimates
direct_effect(test, allocation = .35)$estimate
## [1] 0.2667872
roots(ce_estimates)[3] - roots(ce_estimates)[4]
##
## 0.2667872
# conpare SE estimates
<- c(0, 0, 1, -1)
L <- vcov(ce_estimates)
Sigma sqrt(t(L) %*% Sigma %*% L) # from GEEX
## [,1]
## [1,] 0.03716025
direct_effect(test, allocation = .35)$std.error # from inferference
## [1] 0.02602316
I would expect them to be somewhat different, since
inferference
uses a slightly different variance estimator
defined in the web
appendix of Perez-Heydrich et al (2014).
Estimators of causal effects often have the form:
\[\begin{equation} \label{eq:causal} \sum_{i = 1}^m \psi(O_i, \theta) = \sum_{i = 1}^m \begin{pmatrix} \psi(O_i, \nu) \\ \psi(O_i, \beta) \end{pmatrix} = 0, \end{equation}\]
where \(\nu\) are parameters in
nuisance model(s), such as a propensity score model, and \(\beta\) are the target causal parameters.
Even when \(\nu\) represent parameters
in common statistical models, deriving a closed form for a sandwich
variance estimator for \(\beta\) based
on Equation~\(\ref{eq:causal}\) may
involve tedious and error-prone derivative and matrix calculations
[e.g.; see the appendices of Lunceford and
Davidian (2004) and Perez-Heydrich et al.
(2014)]. In this example, we show how an analyst can avoid these
calculations and compute the empirical sandwich variance estimator using
geex
.
Lunceford and Davidian (2004) review several estimators of causal effects from observational data. To demonstrate a more complicated estimator involving multiple nuisance models, we implement the doubly robust estimator:
\[\begin{equation} \label{eq:dbr} \hat{\Delta}_{DR} = \sum_{i = 1}^m \frac{Z_iY_i - (Z_i - \hat{e}_i) m_1(X_i, \hat{\alpha}_1)}{\hat{e}_i} - \frac{(1 - Z_i)Y_i - (Z_i - \hat{e}_i) m_0(X_i, \hat{\alpha}_0)}{1 - \hat{e}_i}. \end{equation}\]
This estimator targets the average causal effect, \(\Delta = \E[Y(1) - Y(0)]\), where \(Y(z)\) is the potential outcome for an experimental unit had it been exposed to the level \(z\) of the binary exposure variable \(Z\). The estimated propsensity score, \(\hat{e}_i\), is the estimated probability that unit \(i\) received \(z = 1\) and \(m_z(X_i, \hat{\alpha}_z)\) are regression models for the outcome with baseline covariates \(X_i\) and estimated paramaters \(\hat{\alpha}_z\). This estimator has the property that if either the propensity score models or the outcome models are correctly specified, then the solution to Equation~\(\ref{eq:dbr}\) will be a consistent and asymptotically Normal estimator of \(\Delta\).
This estimator and its estimating equations can be translated into an
estFUN
as:
<- function(data, models){
dr_estFUN
<- data$Z
Z <- data$Y
Y
<- grab_design_matrix(
Xe rhs_formula = grab_fixed_formula(models$e))
data, <- grab_design_matrix(
Xm0 rhs_formula = grab_fixed_formula(models$m0))
data, <- grab_design_matrix(
Xm1 rhs_formula = grab_fixed_formula(models$m1))
data,
<- 1:ncol(Xe)
e_pos <- (max(e_pos) + 1):(max(e_pos) + ncol(Xm0))
m0_pos <- (max(m0_pos) + 1):(max(m0_pos) + ncol(Xm1))
m1_pos
<- grab_psiFUN(models$e, data)
e_scores <- grab_psiFUN(models$m0, data)
m0_scores <- grab_psiFUN(models$m1, data)
m1_scores
function(theta){
<- plogis(Xe %*% theta[e_pos])
e <- Xm0 %*% theta[m0_pos]
m0 <- Xm1 %*% theta[m1_pos]
m1 <- (Z*Y - (Z - e) * m1)/e - ((1 - Z) * Y - (Z - e) * m0)/(1 - e)
rd_hat c(e_scores(theta[e_pos]),
m0_scores(theta[m0_pos]) * (Z == 0),
m1_scores(theta[m1_pos]) * (Z == 1),
- theta[length(theta)])
rd_hat
} }
This estFUN
presumes that the user will pass a list
containing fitted model objects for the three nuisance models: the
propensity score model and one regression model for each treatment
group. The functions grab_design_matrix
and
grab_fixed_formula
are geex
utilities for
extracting relevant pieces of a model object. The function
grab_psiFUN
converts a fitted model object to an estimating
function; for example, for a glm
object,
grab_psiFUN
uses the to create a function
of
theta
corresponding to the generalized linear model score
function. The m_estimate
function can be wrapped in another
function, wherein nuisance models are fit and passed to
m_estimate
.
<- function(data, propensity_formula, outcome_formula){
estimate_dr <- glm(propensity_formula, data = data, family = binomial)
e_model <- glm(outcome_formula, subset = (Z == 0), data = data)
m0_model <- glm(outcome_formula, subset = (Z == 1), data = data)
m1_model <- list(e = e_model, m0 = m0_model, m1 = m1_model)
models <- sum(unlist(lapply(models, function(x) length(coef(x))))) + 1
nparms
m_estimate(
estFUN = dr_estFUN,
data = data,
root_control = setup_root_control(start = rep(0, nparms)),
outer_args = list(models = models))
}
The following code provides a function to replicate the simulation settings of Lunceford and Davidian (2004).
library(mvtnorm)
<- c(-1, -1, 1, 1)
tau_0 <- tau_0 * -1
tau_1 <- matrix(
Sigma_X3 c(1, 0.5, -0.5, -0.5,
0.5, 1, -0.5, -0.5,
-0.5, -0.5, 1, 0.5,
-0.5, -0.5, 0.5, 1), ncol = 4, byrow = TRUE)
<- function(n, beta, nu, xi){
gen_data <- rbinom(n, 1, prob = 0.2)
X3 <- rbinom(n, 1, prob = (0.75 * X3 + (0.25 * (1 - X3))))
V3 <- rmvnorm(n, mean = rep(0, 4), Sigma_X3)
hold colnames(hold) <- c("X1", "V1", "X2", "V2")
<- cbind(hold, X3, V3)
hold <- apply(hold, 1, function(x){
hold 1:4] <- x[1:4] + tau_1^(x[5])*tau_0^(1 - x[5])
x[
x})<- t(hold)[, c("X1", "X2", "X3", "V1", "V2", "V3")]
hold <- cbind(Int = 1, hold)
X <- rbinom(n, 1, prob = plogis(X[, 1:4] %*% beta))
Z <- cbind(X[, 1:4], Z, X[, 5:7])
X data.frame(
Y = X %*% c(nu, xi) + rnorm(n),
-1])
X[ , }
To show that estimate_dr
correctly computes \(\hat{\Delta}_{DR}\), the results from
geex
can be compared to computing \(\hat{\Delta}_{DR}\) “by hand” for a
simulated dataset.
<- gen_data(1000, c(0, 0.6, -0.6, 0.6), c(0, -1, 1, -1, 2), c(-1, 1, 1))
dt <- estimate_dr(dt, Z ~ X1 + X2 + X3, Y ~ X1 + X2 + X3) geex_results
<- predict(glm(Z ~ X1 + X2 + X3, data = dt, family = "binomial"),
e type = "response")
<- predict(glm(Y ~ X1 + X2 + X3, data = dt, subset = Z==0), newdata = dt)
m0 <- predict(glm(Y ~ X1 + X2 + X3, data = dt, subset = Z==1), newdata = dt)
m1 <- with(dt, mean( (Z * Y - (Z - e) * m1)/e)) -
del_hat with(dt, mean(((1 - Z) * Y - (Z - e) * m0)/(1 - e)))
These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.