The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.
Firth’s (1993) penalized likelihood removes the leading \(O(1/n)\) bias from maximum likelihood estimates and produces finite estimates even under separation. fastglm implements the general mean-bias reduction of Kosmidis and Firth (2009, 2021), which extends Firth’s original logistic penalty to arbitrary GLM families. The method is activated with a single flag:
The penalty adds \(\frac{1}{2} \partial \log |X^\top W X| / \partial \beta\) to the score, which is equivalent to modifying the IRLS working response by
\[ \xi_i = \frac{\phi \, h_i \, \mu''(\eta_i) \, V(\mu_i)}{2 \, w_i \, [\mu'(\eta_i)]^3} \]
where \(h_i\) is the \(i\)-th diagonal of the hat matrix \(H = W^{1/2} X (X^\top W X)^{-1} X^\top
W^{1/2}\), \(\mu''(\eta)\) is the second
derivative of the inverse link, \(V(\mu)\) is the variance function, \(w_i\) is the prior weight, and \(\phi\) is the dispersion (1 for binomial
and Poisson families; estimated iteratively for Gaussian, Gamma, and
inverse Gaussian). This is the AS_mean adjustment of Kosmidis and Firth
(2009), the same method used by
brglm2::brglmFit(type = "AS_mean").
The adjustment is computed once per IRLS iteration using the leverages from the weighted least-squares decomposition that fastglm already performs, so the overhead relative to an unpenalized fit is modest.
The canonical application is logistic regression with (quasi-)
separation, where the standard MLE diverges. The sex2
dataset from logistf (Heinze and Schemper, 2002)
demonstrates:
data(sex2, package = "logistf")
X <- model.matrix(~ age + oc + vic + vicl + vis + dia, data = sex2)
y <- sex2$case
fit_f <- fastglm(X, y, family = binomial(), firth = TRUE)
fit_l <- logistf::logistf(case ~ age + oc + vic + vicl + vis + dia,
data = sex2, pl = FALSE, plconf = NULL,
control = logistf::logistf.control(
lconv = 1e-12, gconv = 1e-12, xconv = 1e-12,
maxit = 200L))
cbind(fastglm = unname(coef(fit_f)), logistf = unname(coef(fit_l)))
#> fastglm logistf
#> [1,] 0.12025405 0.12025405
#> [2,] -1.10598133 -1.10598133
#> [3,] -0.06881673 -0.06881673
#> [4,] 2.26887465 2.26887465
#> [5,] -2.11140819 -2.11140819
#> [6,] -0.78831695 -0.78831695
#> [7,] 3.09601183 3.09601183
max(abs(unname(coef(fit_f)) - unname(coef(fit_l))))
#> [1] 1.478671e-09Coefficients agree to roughly 1e-7. The runtime advantage is substantial because fastglm’s Firth solver reuses the C++ IRLS infrastructure:
mb <- microbenchmark::microbenchmark(
fastglm = fastglm(X, y, family = binomial(), firth = TRUE),
logistf = logistf::logistf(case ~ age + oc + vic + vicl + vis + dia,
data = sex2, pl = FALSE, plconf = NULL),
times = 10L
)
print(mb, unit = "ms")
#> Unit: milliseconds
#> expr min lq mean median uq max neval cld
#> fastglm 0.255717 0.265229 0.2795995 0.273798 0.283351 0.349320 10 a
#> logistf 1.263456 1.279077 1.3051735 1.287584 1.313230 1.411876 10 bfirth = TRUE works with all standard R families and link
functions. For each family below, we verify that fastglm
reproduces the bias-reduced coefficients from
brglm2::brglmFit(type = "AS_mean") — both implement the
same AS_mean adjustment, so the coefficients should agree to near
machine precision.
library(brglm2)
set.seed(123)
n <- 500
x1 <- rnorm(n); x2 <- rnorm(n)
X <- cbind(1, x1, x2)
eta_true <- 0.5 + 0.3 * x1 - 0.2 * x2y_bin <- rbinom(n, 1, plogis(eta_true))
df <- data.frame(y = y_bin, x1 = x1, x2 = x2)
for (lnk in c("logit", "probit", "cloglog")) {
fam <- binomial(lnk)
f <- fastglm(X, y_bin, family = fam, firth = TRUE, tol = 1e-10)
b <- glm(y ~ x1 + x2, data = df, family = fam, method = "brglmFit",
type = "AS_mean", control = list(epsilon = 1e-10, maxit = 200))
cat(sprintf("binomial %-8s max|diff| = %.2e\n", lnk,
max(abs(unname(coef(f)) - unname(coef(b))))))
}
#> binomial logit max|diff| = 2.29e-12
#> binomial probit max|diff| = 1.10e-11
#> binomial cloglog max|diff| = 3.09e-11y_pois <- rpois(n, exp(eta_true))
for (lnk in c("log", "sqrt")) {
fam <- poisson(lnk)
f <- fastglm(X, y_pois, family = fam, firth = TRUE, tol = 1e-10)
b <- glm(y ~ x1 + x2, data = data.frame(y = y_pois, x1 = x1, x2 = x2),
family = fam, method = "brglmFit", type = "AS_mean",
control = list(epsilon = 1e-10, maxit = 200))
cat(sprintf("poisson %-8s max|diff| = %.2e\n", lnk,
max(abs(unname(coef(f)) - unname(coef(b))))))
}
#> poisson log max|diff| = 2.13e-12
#> poisson sqrt max|diff| = 6.30e-11y_gam <- rgamma(n, shape = 5, rate = 5 / exp(eta_true))
for (lnk in c("log", "inverse")) {
fam <- Gamma(lnk)
f <- fastglm(X, y_gam, family = fam, firth = TRUE, tol = 1e-10)
b <- glm(y ~ x1 + x2, data = data.frame(y = y_gam, x1 = x1, x2 = x2),
family = fam, method = "brglmFit", type = "AS_mean",
control = list(epsilon = 1e-10, maxit = 200))
cat(sprintf("Gamma %-8s max|diff| = %.2e\n", lnk,
max(abs(unname(coef(f)) - unname(coef(b))))))
}
#> Gamma log max|diff| = 2.10e-05
#> Gamma inverse max|diff| = 1.67e-05y_gauss <- eta_true + rnorm(n, 0, 0.5)
for (lnk in c("identity", "log")) {
fam <- gaussian(lnk)
y0 <- if (lnk == "log") pmax(y_gauss, 0.01) else y_gauss
f <- fastglm(X, y0, family = fam, firth = TRUE, tol = 1e-10)
b <- glm(y ~ x1 + x2, data = data.frame(y = y0, x1 = x1, x2 = x2),
family = fam, method = "brglmFit", type = "AS_mean",
control = list(epsilon = 1e-10, maxit = 200))
cat(sprintf("gaussian %-8s max|diff| = %.2e\n", lnk,
max(abs(unname(coef(f)) - unname(coef(b))))))
}
#> gaussian identity max|diff| = 1.67e-16
#> gaussian log max|diff| = 1.78e-06mu_ig <- exp(eta_true)
y_ig <- statmod::rinvgauss(n, mean = mu_ig, shape = 5)
y_ig[y_ig <= 0] <- 0.01
f <- fastglm(X, y_ig, family = inverse.gaussian("log"), firth = TRUE, tol = 1e-10)
b <- glm(y ~ x1 + x2, data = data.frame(y = y_ig, x1 = x1, x2 = x2),
family = inverse.gaussian("log"), method = "brglmFit", type = "AS_mean",
control = list(epsilon = 1e-10, maxit = 200))
cat(sprintf("inv.gauss log max|diff| = %.2e\n",
max(abs(unname(coef(f)) - unname(coef(b))))))
#> inv.gauss log max|diff| = 1.16e-06The standard errors reported by fastglm with
firth = TRUE come from the unscaled \((X^\top W X)^{-1}\) at convergence, the
same quantity that brglm2 uses. For unit-dispersion
families (binomial, Poisson), these are the Firth-penalized standard
errors directly; for other families, the dispersion is estimated as
\(\hat\phi = D / (n - p)\) and applied
to the variance-covariance matrix.
f <- fastglm(X, y_bin, family = binomial(), firth = TRUE, tol = 1e-10)
b <- glm(y ~ x1 + x2, data = df, family = binomial(), method = "brglmFit",
type = "AS_mean", control = list(epsilon = 1e-10, maxit = 200))
cbind(fastglm_SE = f$se, brglm2_SE = summary(b)$coefficients[, "Std. Error"])
#> fastglm_SE brglm2_SE
#> (Intercept) 0.09423309 0.09423309
#> x1 0.09910290 0.09910290
#> x2 0.09527990 0.09527990
max(abs(f$se - summary(b)$coefficients[, "Std. Error"]))
#> [1] 8.0283e-14The Firth-penalized deviance is \(D^* = D - \log |X^\top W X|\), where \(D\) is the standard deviance and the log-determinant comes from the information matrix at the penalized MLE. Both the standard and penalized deviances are reported:
A timing comparison of fastglm(firth = TRUE) against
brglm2::brglmFit(type = "AS_mean") across families with
n = 10000 observations and p = 5
covariates:
set.seed(123)
n <- 10000; p <- 5
x1 <- rnorm(n); x2 <- rnorm(n); x3 <- rnorm(n)
x4 <- rnorm(n); x5 <- rnorm(n)
X <- cbind(1, x1, x2, x3, x4, x5)
eta <- 0.5 + 0.3 * x1 - 0.2 * x2 + 0.1 * x3
families <- list(
`binomial logit` = list(fam = binomial("logit"),
y = rbinom(n, 1, plogis(eta))),
`binomial probit` = list(fam = binomial("probit"),
y = rbinom(n, 1, plogis(eta))),
`binomial cloglog` = list(fam = binomial("cloglog"),
y = rbinom(n, 1, plogis(eta))),
`poisson log` = list(fam = poisson("log"),
y = rpois(n, exp(eta))),
`Gamma log` = list(fam = Gamma("log"),
y = rgamma(n, shape = 5, rate = 5 / exp(eta))),
`gaussian identity`= list(fam = gaussian("identity"),
y = eta + rnorm(n, 0, 0.5))
)
df <- data.frame(y = 0, x1 = x1, x2 = x2, x3 = x3, x4 = x4, x5 = x5)
results <- list()
for (nm in names(families)) {
fi <- families[[nm]]
df$y <- fi$y
mb <- microbenchmark::microbenchmark(
fastglm = fastglm(X, fi$y, family = fi$fam, firth = TRUE),
brglm2 = glm(y ~ x1 + x2 + x3 + x4 + x5, data = df,
family = fi$fam, method = "brglmFit", type = "AS_mean"),
times = 10L
)
s <- summary(mb, unit = "ms")
cat(sprintf("%-20s fastglm=%7.2f ms brglm2=%7.2f ms speedup=%.1fx\n",
nm, s$median[1], s$median[2], s$median[2] / s$median[1]))
}
#> binomial logit fastglm= 3.22 ms brglm2= 13.31 ms speedup=4.1x
#> binomial probit fastglm= 4.22 ms brglm2= 15.77 ms speedup=3.7x
#> binomial cloglog fastglm= 5.58 ms brglm2= 16.75 ms speedup=3.0x
#> poisson log fastglm= 3.66 ms brglm2= 121.21 ms speedup=33.1x
#> Gamma log fastglm= 3.64 ms brglm2= 100.23 ms speedup=27.6x
#> gaussian identity fastglm= 1.02 ms brglm2= 21.17 ms speedup=20.7xFirth, D. (1993). Bias reduction of maximum likelihood estimates. Biometrika, 80(1), 27–38. doi:10.1093/biomet/80.1.27
Heinze, G. and Schemper, M. (2002). A solution to the problem of separation in logistic regression. Statistics in Medicine, 21(16), 2409–2419. doi:10.1002/sim.1047
Kosmidis, I. and Firth, D. (2009). Bias reduction in exponential family nonlinear models. Biometrika, 96(4), 793–804. doi:10.1093/biomet/asp055
Kosmidis, I. and Firth, D. (2021). Jeffreys-prior penalty, finiteness and shrinkage in binomial-response generalized linear models. Biometrika, 108(1), 71–82. doi:10.1093/biomet/asaa052
Kosmidis, I. (2014). Bias in parametric estimation: reduction and useful side-effects. WIREs Computational Statistics, 6(3), 185–196. doi:10.1002/wics.1296
These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.