This repository hosts the code for the MLBC package, which implements the bias correction methods described in Battaglia, Christensen, Hansen & Sacher (2024). A sample application of this package can be found in the file ex1.Rmd.
## Getting Started

This package can be installed from CRAN by typing

```r
install.packages("MLBC")
```

into the R console. The core functions of the package are:
## ols_bca

This procedure first computes the standard OLS estimator on a design matrix (Xhat), the first column of which contains AI/ML-generated binary labels, and then applies an additive correction based on an estimate (fpr) of the false-positive rate computed externally. The method also adjusts the variance estimator with a finite-sample correction term to account for the uncertainty in the bias estimation.

Parameters
- Y: numeric response vector.
- Xhat: numeric matrix of regressors; the first column should contain the AI/ML-generated binary labels to correct.
- fpr: numeric; externally estimated false-positive rate of the generated labels.
- m: integer; size of the labeled sample used to estimate fpr.
- intercept: logical; if TRUE, an intercept column of 1's is prepended.

Returns
An object of class mlbc_fit and subclass mlbc_bca, a list with elements:
- coef: numeric vector of bias-corrected coefficients (intercept first, if requested).
- vcov: variance–covariance matrix of those coefficients.
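For illustration, a minimal call might look like the following sketch. The simulated data and the fpr value of 0.05 are placeholders; the design-matrix convention (ML-generated label in the first column, followed by a constant) matches the simulation study further below.

```r
library(MLBC)

set.seed(123)
n    <- 1000                    # unlabeled sample size
m    <- 200                     # labeled sample size behind the fpr estimate
Xhat <- rbinom(n, 1, 0.5)       # AI/ML-generated binary labels
Y    <- 10 + Xhat + rnorm(n)    # outcome
X    <- cbind(Xhat, rep(1, n))  # ML label first, then a constant

fit <- ols_bca(Y, X, 0.05, m)   # 0.05 = externally estimated fpr
fit$coef                        # bias-corrected coefficients
fit$vcov                        # adjusted variance-covariance matrix
```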
## ols_bcm

This procedure first computes the standard OLS estimator on a design matrix (Xhat), the first column of which contains AI/ML-generated binary labels, and then applies a multiplicative correction based on an estimate (fpr) of the false-positive rate computed externally. The method also adjusts the variance estimator with a finite-sample correction term to account for the uncertainty in the bias estimation.

Parameters
- Y: numeric response vector.
- Xhat: numeric matrix of regressors; the first column should contain the AI/ML-generated binary labels to correct.
- fpr: numeric; externally estimated false-positive rate of the generated labels.
- m: integer; size of the labeled sample used to estimate fpr.
- intercept: logical; if TRUE, an intercept column of 1's is prepended.

Returns
An object of class mlbc_fit and subclass mlbc_bcm, a list with elements:
- coef: numeric vector of bias-corrected coefficients (intercept first, if requested).
- vcov: variance–covariance matrix of those coefficients.
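Since ols_bcm shares the ols_bca call signature, the two corrections can be compared directly on the same data. A sketch, reusing the simulated objects from the ols_bca example above:

```r
fit_a <- ols_bca(Y, X, 0.05, m)  # additive correction
fit_m <- ols_bcm(Y, X, 0.05, m)  # multiplicative correction
cbind(additive = fit_a$coef, multiplicative = fit_m$coef)
```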
## one_step

This method jointly estimates the upstream (measurement) and downstream (regression) models using only the unlabeled likelihood. Leveraging TMB, it minimizes the negative log-likelihood to obtain the regression coefficients. The variance is then approximated via the inverse Hessian at the optimum.

Parameters
- Y: numeric response vector.
- Xhat: numeric matrix of regressors excluding the intercept. The first column should be the ML-generated regressor to correct.
- homoskedastic: logical; if TRUE, assume a single error variance.
- distribution: character; one of "normal", "t", "laplace", "gamma", or "beta". Specifies which conditional density to use for residuals in the likelihood estimation.
- nu: numeric; degrees of freedom (only used if distribution = "t").
- gshape, gscale: numeric; shape and scale for the gamma distribution (only used if distribution = "gamma").
- ba, bb: numeric; alpha and beta for the beta distribution (only used if distribution = "beta").
- intercept: logical; if TRUE, an intercept term will be estimated.

Returns
An object of class mlbc_fit and subclass mlbc_onestep, a list with elements:
- coef: numeric vector of bias-corrected coefficients (intercept first, if requested).
- vcov: variance–covariance matrix of those coefficients.
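A minimal sketch of calling one_step, first with its default settings and then with a t likelihood for heavier-tailed errors (Y and X as in the examples above; the nu value is illustrative, and the design matrix follows the convention of the simulation below):

```r
fit_os <- one_step(Y, X)                              # default settings
fit_t  <- one_step(Y, X, distribution = "t", nu = 5)  # t-distributed errors

fit_os$coef              # regression coefficients
sqrt(diag(fit_os$vcov))  # standard errors from the inverse Hessian
```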
The following simulation study compares these estimators on data with noisily ML-generated labels:

```r
library(MLBC)

# Simulation parameters
nsim  <- 1000             # number of Monte Carlo replications
n     <- 16000            # unlabeled (training) sample size
m     <- 1000             # labeled (test) sample size
p     <- 0.05             # P(X = 1)
kappa <- 1
fpr   <- kappa / sqrt(n)  # false-positive rate of the ML labels

# True regression coefficients and error scales
b0 <- 10                  # intercept
b1 <- 1                   # slope on X
a0 <- 0.3                 # error sd when X = 0
a1 <- 0.5                 # error sd when X = 1

# Beta-prior hyperparameters for the shrunk fpr estimates
alpha <- c(0.0, 0.5, 0.5)
beta  <- c(0.0, 2.0, 4.0)

# Pre-allocated storage: nsim replications x 9 methods x 2 coefficients
B <- array(0, dim = c(nsim, 9, 2))  # point estimates
S <- array(0, dim = c(nsim, 9, 2))  # standard errors

# Store estimates and standard errors for one method in one replication
update_results <- function(b, V, i, method_idx) {
  for (j in 1:2) {
    B[i, method_idx, j] <<- b[j]
    S[i, method_idx, j] <<- sqrt(max(V[j, j], 0))
  }
}
```
```r
# Generate a dataset with true labels X and noisy ML labels Xhat
generate_data <- function(n, m, p, fpr, b0, b1, a0, a1) {
  N    <- n + m
  X    <- numeric(N)
  Xhat <- numeric(N)
  u    <- runif(N)

  for (j in seq_len(N)) {
    if (u[j] <= fpr) X[j] <- 1            # false negative: X = 1, Xhat = 0
    else if (u[j] <= 2*fpr) Xhat[j] <- 1  # false positive: Xhat = 1, X = 0
    else if (u[j] <= p + fpr) {           # true positive: both equal 1
      X[j]    <- 1
      Xhat[j] <- 1
    }
  }

  eps <- rnorm(N)  # N(0,1)
  # heteroskedastic noise: σ₁ when X=1, σ₀ when X=0
  Y <- b0 + b1*X + (a1*X + a0*(1 - X)) * eps

  # split into train (unlabeled) vs test (labeled)
  train_Y   <- Y[1:n]
  train_X   <- cbind(Xhat[1:n], rep(1, n))
  test_Y    <- Y[(n+1):N]
  test_Xhat <- cbind(Xhat[(n+1):N], rep(1, m))
  test_X    <- cbind(X[(n+1):N], rep(1, m))

  list(
    train_Y   = train_Y,
    train_X   = train_X,
    test_Y    = test_Y,
    test_Xhat = test_Xhat,
    test_X    = test_X
  )
}
```
```r
for (i in seq_len(nsim)) {
  dat <- generate_data(n, m, p, fpr, b0, b1, a0, a1)
  tY <- dat$train_Y; tX <- dat$train_X
  eY <- dat$test_Y; eXhat <- dat$test_Xhat; eX <- dat$test_X

  # 1) OLS on unlabeled (X̂)
  ols_res <- ols(tY, tX)
  update_results(ols_res$coef, ols_res$vcov, i, 1)

  # 2) OLS on labeled (true X)
  ols_res <- ols(eY, eX)
  update_results(ols_res$coef, ols_res$vcov, i, 2)

  # 3–8) Additive & multiplicative bias corrections
  # fpr_hat: share of false positives in the labeled sample
  fpr_hat <- mean(eXhat[,1] * (1 - eX[,1]))
  for (j in 1:3) {
    # posterior-mean fpr under a Beta(alpha, beta) prior
    fpr_bayes <- (fpr_hat * m + alpha[j]) / (m + alpha[j] + beta[j])

    bca_res <- ols_bca(tY, tX, fpr_bayes, m)
    update_results(bca_res$coef, bca_res$vcov, i, 2 + j)

    bcm_res <- ols_bcm(tY, tX, fpr_bayes, m)
    update_results(bcm_res$coef, bcm_res$vcov, i, 5 + j)
  }

  # 9) One-step estimator
  one_res <- one_step(tY, tX)
  update_results(one_res$coef, one_res$vcov, i, 9)

  if (i %% 100 == 0) {
    message("Completed ", i, " of ", nsim, " sims")
  }
}
```
```
## Completed 100 of 1000 sims
## Completed 200 of 1000 sims
## Completed 300 of 1000 sims
## Completed 400 of 1000 sims
## Completed 500 of 1000 sims
## Completed 600 of 1000 sims
## Completed 700 of 1000 sims
## Completed 800 of 1000 sims
## Completed 900 of 1000 sims
## Completed 1000 of 1000 sims
```
```r
# Empirical coverage of nominal 95% intervals over a grid of candidate values
coverage <- function(bgrid, b, se) {
  n_grid <- length(bgrid)
  cvg <- numeric(n_grid)
  for (i in seq_along(bgrid)) {
    val <- bgrid[i]
    cvg[i] <- mean(abs(b - val) <= 1.96 * se)
  }
  cvg
}

true_beta1 <- 1.0

methods <- c(
  "OLS ĥ"  = 1,
  "OLS θ̂" = 2,
  "BCA-0"  = 3,
  "BCA-1"  = 4,
  "BCA-2"  = 5,
  "BCM-0"  = 6,
  "BCM-1"  = 7,
  "BCM-2"  = 8,
  "OSU"    = 9
)

# Coverage of the true slope by nominal 95% confidence intervals, per method
cov_dict <- sapply(methods, function(col) {
  slopes <- B[, col, 1]
  ses    <- S[, col, 1]
  mean(abs(slopes - true_beta1) <= 1.96 * ses)
})
cov_series <- setNames(cov_dict, names(methods))
print(cov_series)
```
```
## OLS ĥ OLS θ̂  BCA-0  BCA-1  BCA-2  BCM-0  BCM-1  BCM-2    OSU
## 0.000  0.952  0.841  0.890  0.888  0.887  0.912  0.911  0.951
```
```r
method_names <- names(methods)
coef_names   <- c("Beta1", "Beta0")
nmethods     <- dim(B)[2]

df <- data.frame(Method = method_names, stringsAsFactors = FALSE)
df$Est_Beta1  <- NA_real_
df$SE_Beta1   <- NA_real_
df$CI95_Beta1 <- NA_character_
df$Est_Beta0  <- NA_real_
df$SE_Beta0   <- NA_real_
df$CI95_Beta0 <- NA_character_

for (i in seq_len(nmethods)) {
  est1 <- B[, i, 1]; se1 <- S[, i, 1]
  est0 <- B[, i, 2]; se0 <- S[, i, 2]

  # 2.5% and 97.5% quantiles of the Monte Carlo estimates
  ci1 <- quantile(est1, probs = c(0.025, 0.975))
  ci0 <- quantile(est0, probs = c(0.025, 0.975))

  df$Est_Beta1[i]  <- mean(est1)
  df$SE_Beta1[i]   <- mean(se1)
  df$CI95_Beta1[i] <- sprintf("[%0.3f, %0.3f]", ci1[1], ci1[2])

  df$Est_Beta0[i]  <- mean(est0)
  df$SE_Beta0[i]   <- mean(se0)
  df$CI95_Beta0[i] <- sprintf("[%0.3f, %0.3f]", ci0[1], ci0[2])
}
print(df)
```
```
##   Method Est_Beta1   SE_Beta1     CI95_Beta1 Est_Beta0    SE_Beta0
## 1  OLS ĥ 0.8345230 0.02126622 [0.793, 0.876] 10.008229 0.002559755
## 2 OLS θ̂ 0.9979584 0.07110668 [0.862, 1.139]  9.999644 0.009712562
## 3  BCA-0 0.9733887 0.05192212 [0.879, 1.097] 10.001295 0.003527102
## 4  BCA-1 0.9818128 0.05328685 [0.888, 1.105] 10.000874 0.003578399
## 5  BCA-2 0.9815196 0.05324235 [0.888, 1.105] 10.000889 0.003576679
## 6  BCM-0 1.0064253 0.06465110 [0.886, 1.197]  9.999649 0.003963054
## 7  BCM-1 1.0188792 0.06713945 [0.896, 1.213]  9.999027 0.004060987
## 8  BCM-2 1.0184172 0.06705113 [0.896, 1.212]  9.999050 0.004057390
## 9    OSU 0.9985015 0.03102350 [0.934, 1.055]  9.999928 0.002500163
##       CI95_Beta0
## 1 [10.003, 10.013]
## 2  [9.981, 10.018]
## 3  [9.994, 10.007]
## 4  [9.994, 10.007]
## 5  [9.994, 10.007]
## 6  [9.990, 10.007]
## 7  [9.989, 10.007]
## 8  [9.989, 10.007]
## 9  [9.995, 10.005]
```