The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

Firth Bias-Reduced GLMs with ‘fastglm’

Jared Huling

2026-06-05

Firth’s (1993) penalized likelihood removes the leading \(O(1/n)\) bias from maximum likelihood estimates and produces finite estimates even under separation. fastglm implements the general mean-bias reduction of Kosmidis and Firth (2009, 2021), which extends Firth’s original logistic penalty to arbitrary GLM families. The method is activated with a single flag:

The penalty adds \(\frac{1}{2} \partial \log |X^\top W X| / \partial \beta\) to the score, which is equivalent to modifying the IRLS working response by

\[ \xi_i = \frac{\phi \, h_i \, \mu''(\eta_i) \, V(\mu_i)}{2 \, w_i \, [\mu'(\eta_i)]^3} \]

where \(h_i\) is the \(i\)-th diagonal of the hat matrix \(H = W^{1/2} X (X^\top W X)^{-1} X^\top W^{1/2}\), \(\mu''(\eta)\) is the second derivative of the inverse link, \(V(\mu)\) is the variance function, \(w_i\) is the prior weight, and \(\phi\) is the dispersion (1 for binomial and Poisson families; estimated iteratively for Gaussian, Gamma, and inverse Gaussian). This is the AS_mean adjustment of Kosmidis and Firth (2009), the same method used by brglm2::brglmFit(type = "AS_mean").

The adjustment is computed once per IRLS iteration using the leverages from the weighted least-squares decomposition that fastglm already performs, so the overhead relative to an unpenalized fit is modest.

Logistic regression under separation

The canonical application is logistic regression with (quasi-) separation, where the standard MLE diverges. The sex2 dataset from logistf (Heinze and Schemper, 2002) demonstrates:

data(sex2, package = "logistf")
X <- model.matrix(~ age + oc + vic + vicl + vis + dia, data = sex2)
y <- sex2$case

fit_f <- fastglm(X, y, family = binomial(), firth = TRUE)
fit_l <- logistf::logistf(case ~ age + oc + vic + vicl + vis + dia,
                          data = sex2, pl = FALSE, plconf = NULL,
                          control = logistf::logistf.control(
                              lconv = 1e-12, gconv = 1e-12, xconv = 1e-12,
                              maxit = 200L))

cbind(fastglm = unname(coef(fit_f)), logistf = unname(coef(fit_l)))
#>          fastglm     logistf
#> [1,]  0.12025405  0.12025405
#> [2,] -1.10598133 -1.10598133
#> [3,] -0.06881673 -0.06881673
#> [4,]  2.26887465  2.26887465
#> [5,] -2.11140819 -2.11140819
#> [6,] -0.78831695 -0.78831695
#> [7,]  3.09601183  3.09601183
max(abs(unname(coef(fit_f)) - unname(coef(fit_l))))
#> [1] 1.478671e-09

Coefficients agree to roughly 1e-7. The runtime advantage is substantial because fastglm’s Firth solver reuses the C++ IRLS infrastructure:

mb <- microbenchmark::microbenchmark(
    fastglm = fastglm(X, y, family = binomial(), firth = TRUE),
    logistf = logistf::logistf(case ~ age + oc + vic + vicl + vis + dia,
                               data = sex2, pl = FALSE, plconf = NULL),
    times = 10L
)
print(mb, unit = "ms")
#> Unit: milliseconds
#>     expr      min       lq      mean   median       uq      max neval cld
#>  fastglm 0.255717 0.265229 0.2795995 0.273798 0.283351 0.349320    10  a 
#>  logistf 1.263456 1.279077 1.3051735 1.287584 1.313230 1.411876    10   b

General GLM families

firth = TRUE works with all standard R families and link functions. For each family below, we verify that fastglm reproduces the bias-reduced coefficients from brglm2::brglmFit(type = "AS_mean") — both implement the same AS_mean adjustment, so the coefficients should agree to near machine precision.

library(brglm2)
set.seed(123)
n <- 500
x1 <- rnorm(n);  x2 <- rnorm(n)
X <- cbind(1, x1, x2)
eta_true <- 0.5 + 0.3 * x1 - 0.2 * x2

Binomial (logit, probit, cloglog)

y_bin <- rbinom(n, 1, plogis(eta_true))
df <- data.frame(y = y_bin, x1 = x1, x2 = x2)

for (lnk in c("logit", "probit", "cloglog")) {
    fam <- binomial(lnk)
    f <- fastglm(X, y_bin, family = fam, firth = TRUE, tol = 1e-10)
    b <- glm(y ~ x1 + x2, data = df, family = fam, method = "brglmFit",
             type = "AS_mean", control = list(epsilon = 1e-10, maxit = 200))
    cat(sprintf("binomial %-8s max|diff| = %.2e\n", lnk,
                max(abs(unname(coef(f)) - unname(coef(b))))))
}
#> binomial logit    max|diff| = 2.29e-12
#> binomial probit   max|diff| = 1.10e-11
#> binomial cloglog  max|diff| = 3.09e-11

Poisson (log, sqrt)

y_pois <- rpois(n, exp(eta_true))

for (lnk in c("log", "sqrt")) {
    fam <- poisson(lnk)
    f <- fastglm(X, y_pois, family = fam, firth = TRUE, tol = 1e-10)
    b <- glm(y ~ x1 + x2, data = data.frame(y = y_pois, x1 = x1, x2 = x2),
             family = fam, method = "brglmFit", type = "AS_mean",
             control = list(epsilon = 1e-10, maxit = 200))
    cat(sprintf("poisson  %-8s max|diff| = %.2e\n", lnk,
                max(abs(unname(coef(f)) - unname(coef(b))))))
}
#> poisson  log      max|diff| = 2.13e-12
#> poisson  sqrt     max|diff| = 6.30e-11

Gamma (log, inverse)

y_gam <- rgamma(n, shape = 5, rate = 5 / exp(eta_true))

for (lnk in c("log", "inverse")) {
    fam <- Gamma(lnk)
    f <- fastglm(X, y_gam, family = fam, firth = TRUE, tol = 1e-10)
    b <- glm(y ~ x1 + x2, data = data.frame(y = y_gam, x1 = x1, x2 = x2),
             family = fam, method = "brglmFit", type = "AS_mean",
             control = list(epsilon = 1e-10, maxit = 200))
    cat(sprintf("Gamma    %-8s max|diff| = %.2e\n", lnk,
                max(abs(unname(coef(f)) - unname(coef(b))))))
}
#> Gamma    log      max|diff| = 2.10e-05
#> Gamma    inverse  max|diff| = 1.67e-05

Gaussian (identity, log)

y_gauss <- eta_true + rnorm(n, 0, 0.5)

for (lnk in c("identity", "log")) {
    fam <- gaussian(lnk)
    y0 <- if (lnk == "log") pmax(y_gauss, 0.01) else y_gauss
    f <- fastglm(X, y0, family = fam, firth = TRUE, tol = 1e-10)
    b <- glm(y ~ x1 + x2, data = data.frame(y = y0, x1 = x1, x2 = x2),
             family = fam, method = "brglmFit", type = "AS_mean",
             control = list(epsilon = 1e-10, maxit = 200))
    cat(sprintf("gaussian %-8s max|diff| = %.2e\n", lnk,
                max(abs(unname(coef(f)) - unname(coef(b))))))
}
#> gaussian identity max|diff| = 1.67e-16
#> gaussian log      max|diff| = 1.78e-06

Inverse Gaussian (log)

mu_ig <- exp(eta_true)
y_ig <- statmod::rinvgauss(n, mean = mu_ig, shape = 5)
y_ig[y_ig <= 0] <- 0.01

f <- fastglm(X, y_ig, family = inverse.gaussian("log"), firth = TRUE, tol = 1e-10)
b <- glm(y ~ x1 + x2, data = data.frame(y = y_ig, x1 = x1, x2 = x2),
         family = inverse.gaussian("log"), method = "brglmFit", type = "AS_mean",
         control = list(epsilon = 1e-10, maxit = 200))
cat(sprintf("inv.gauss log      max|diff| = %.2e\n",
            max(abs(unname(coef(f)) - unname(coef(b))))))
#> inv.gauss log      max|diff| = 1.16e-06

Standard errors

The standard errors reported by fastglm with firth = TRUE come from the unscaled \((X^\top W X)^{-1}\) at convergence, the same quantity that brglm2 uses. For unit-dispersion families (binomial, Poisson), these are the Firth-penalized standard errors directly; for other families, the dispersion is estimated as \(\hat\phi = D / (n - p)\) and applied to the variance-covariance matrix.

f <- fastglm(X, y_bin, family = binomial(), firth = TRUE, tol = 1e-10)
b <- glm(y ~ x1 + x2, data = df, family = binomial(), method = "brglmFit",
         type = "AS_mean", control = list(epsilon = 1e-10, maxit = 200))

cbind(fastglm_SE = f$se, brglm2_SE = summary(b)$coefficients[, "Std. Error"])
#>             fastglm_SE  brglm2_SE
#> (Intercept) 0.09423309 0.09423309
#> x1          0.09910290 0.09910290
#> x2          0.09527990 0.09527990
max(abs(f$se - summary(b)$coefficients[, "Std. Error"]))
#> [1] 8.0283e-14

Penalized deviance

The Firth-penalized deviance is \(D^* = D - \log |X^\top W X|\), where \(D\) is the standard deviance and the log-determinant comes from the information matrix at the penalized MLE. Both the standard and penalized deviances are reported:

f <- fastglm(X, y_bin, family = binomial(), firth = TRUE)
c(deviance = f$deviance,
  log.det.XtWX = f$log.det.XtWX,
  penalized.deviance = f$penalized.deviance)
#>           deviance       log.det.XtWX penalized.deviance 
#>          644.08511           14.05578          630.02933

Speed comparison across families

A timing comparison of fastglm(firth = TRUE) against brglm2::brglmFit(type = "AS_mean") across families with n = 10000 observations and p = 5 covariates:

set.seed(123)
n <- 10000;  p <- 5
x1 <- rnorm(n);  x2 <- rnorm(n); x3 <- rnorm(n)
x4 <- rnorm(n);  x5 <- rnorm(n)
X <- cbind(1, x1, x2, x3, x4, x5)
eta <- 0.5 + 0.3 * x1 - 0.2 * x2 + 0.1 * x3

families <- list(
    `binomial logit`   = list(fam = binomial("logit"),
                              y = rbinom(n, 1, plogis(eta))),
    `binomial probit`  = list(fam = binomial("probit"),
                              y = rbinom(n, 1, plogis(eta))),
    `binomial cloglog` = list(fam = binomial("cloglog"),
                              y = rbinom(n, 1, plogis(eta))),
    `poisson log`      = list(fam = poisson("log"),
                              y = rpois(n, exp(eta))),
    `Gamma log`        = list(fam = Gamma("log"),
                              y = rgamma(n, shape = 5, rate = 5 / exp(eta))),
    `gaussian identity`= list(fam = gaussian("identity"),
                              y = eta + rnorm(n, 0, 0.5))
)

df <- data.frame(y = 0, x1 = x1, x2 = x2, x3 = x3, x4 = x4, x5 = x5)
results <- list()
for (nm in names(families)) {
    fi <- families[[nm]]
    df$y <- fi$y
    mb <- microbenchmark::microbenchmark(
        fastglm = fastglm(X, fi$y, family = fi$fam, firth = TRUE),
        brglm2  = glm(y ~ x1 + x2 + x3 + x4 + x5, data = df,
                      family = fi$fam, method = "brglmFit", type = "AS_mean"),
        times = 10L
    )
    s <- summary(mb, unit = "ms")
    cat(sprintf("%-20s fastglm=%7.2f ms  brglm2=%7.2f ms  speedup=%.1fx\n",
                nm, s$median[1], s$median[2], s$median[2] / s$median[1]))
}
#> binomial logit       fastglm=   3.22 ms  brglm2=  13.31 ms  speedup=4.1x
#> binomial probit      fastglm=   4.22 ms  brglm2=  15.77 ms  speedup=3.7x
#> binomial cloglog     fastglm=   5.58 ms  brglm2=  16.75 ms  speedup=3.0x
#> poisson log          fastglm=   3.66 ms  brglm2= 121.21 ms  speedup=33.1x
#> Gamma log            fastglm=   3.64 ms  brglm2= 100.23 ms  speedup=27.6x
#> gaussian identity    fastglm=   1.02 ms  brglm2=  21.17 ms  speedup=20.7x

References

Firth, D. (1993). Bias reduction of maximum likelihood estimates. Biometrika, 80(1), 27–38. doi:10.1093/biomet/80.1.27
Heinze, G. and Schemper, M. (2002). A solution to the problem of separation in logistic regression. Statistics in Medicine, 21(16), 2409–2419. doi:10.1002/sim.1047
Kosmidis, I. and Firth, D. (2009). Bias reduction in exponential family nonlinear models. Biometrika, 96(4), 793–804. doi:10.1093/biomet/asp055
Kosmidis, I. and Firth, D. (2021). Jeffreys-prior penalty, finiteness and shrinkage in binomial-response generalized linear models. Biometrika, 108(1), 71–82. doi:10.1093/biomet/asaa052
Kosmidis, I. (2014). Bias in parametric estimation: reduction and useful side-effects. WIREs Computational Statistics, 6(3), 185–196. doi:10.1002/wics.1296

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.