The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

R-CMD-check

Overview

Details

A small package for joint calibration of totals and quantiles (see Beręsewicz and Szymkowiak (2023) working paper for details). The package combines the following approaches:

which allows to calibrate weights to known (or estimated) totals and quantiles jointly. As an backend for calibration sampling (sampling::calib), laeken (laeken::calibWeights), survey (survey::grake) or ebal (ebal::eb) package can be used. One can also apply empirical likelihood using codes from Wu (2005) with support of stats::constrOptim as used in Zhang, Han and Wu (2022).

backend method function called
sampling c("raking", "linear", "logit", "truncated") sampling::calib
laeken c("raking", "linear", "logit") laeken::calibWeights
survey c("raking", "linear", "logit", "sinh") survey::grake
ebal eb ebal::eb
base el R code and stats::constrOptim

Currently supports:

Further plans:

Funding

Work on this package is supported by the the National Science Centre, OPUS 22 grant no. 2020/39/B/HS4/00941.

Installation

You can install the development version of jointCalib from GitHub with:

# install.packages("remotes")
remotes::install_github("ncn-foreigners/jointCalib")

Examples

Load packages

library(jointCalib)
library(survey)
library(laeken)
library(ebal)

Example 1 – census case

Based on Haziza, D., and Lesage, É. (2016). A discussion of weighting procedures for unit nonresponse. Journal of Official Statistics, 32(1), 129-145.

set.seed(20230817)
N <- 1000
x <- runif(N, 0, 80)
#y <- 1000+10*x+rnorm(N, 0, 300)
#y <- exp(-0.1 + 0.1*x) + rnorm(N, 0, 300)
#y <- rbinom(N, 1, prob = 1/(exp(-0.5*(x-55))+1))
y <- 1300-(x-40)^2 + rnorm(N, 0, 300)
#p <- rbinom(N, 1, prob = 0.2+0.6*(1 + exp(-5 + x/8))^-1)
p <- rbinom(N, 1, prob = 0.07+0.45*(x/40-1)^2+0.0025*x)
#p <- rbinom(N, 1, prob = (1.2+0.024*x)^-1)
#p <- rbinom(N, 1, prob = exp(-0.2 - 0.014*x))
probs <- seq(0.1, 0.9, 0.1)
quants_known <- list(x=quantile(x, probs))
totals_known <- c(x=sum(x))
df <- data.frame(x, y, p)
df_resp <- df[df$p == 1, ]
df_resp$d <- N/nrow(df_resp)
y_quant_true <- quantile(y, probs)
head(df_resp)
#>           x        y p        d
#> 6  12.35687 444.9053 1 3.134796
#> 7  61.90319 403.9473 1 3.134796
#> 13 60.96079 923.4975 1 3.134796
#> 14 76.85300 124.4110 1 3.134796
#> 18 71.52828 422.0934 1 3.134796
#> 19 65.32808 740.4801 1 3.134796

Using jointCalib package

example 1a: calibrate only quantiles (deciles)

result1 <- joint_calib(formula_quantiles = ~x,
                      data=df_resp,
                      dweights=df_resp$d,
                      N = N,
                      pop_quantiles = quants_known,
                      method = "linear",
                      backend = "sampling")
result1
#> Weights calibrated using: linear withsamplingbackend.
#> Summary statistics for g-weights:
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  0.4562  0.6773  0.8180  1.0000  1.4500  2.4538 
#> Totals and precision (abs diff: 4.574665e-10)
#>       totals    precision
#>  [1,]  1e+03 4.557705e-10
#>  [2,]  1e-01 5.020984e-14
#>  [3,]  2e-01 1.055545e-13
#>  [4,]  3e-01 1.324496e-13
#>  [5,]  4e-01 1.580958e-13
#>  [6,]  5e-01 1.765255e-13
#>  [7,]  6e-01 1.991740e-13
#>  [8,]  7e-01 2.304823e-13
#>  [9,]  8e-01 2.882139e-13
#> [10,]  9e-01 3.552714e-13
data.frame(true = quants_known$x, est = weightedQuantile(df_resp$x, result1$g*df_resp$d, probs))
#>          true       est
#> 10%  7.078067  7.085003
#> 20% 14.831221 14.824424
#> 30% 23.146180 23.287657
#> 40% 31.641911 31.802986
#> 50% 39.033812 39.154276
#> 60% 47.527168 48.252065
#> 70% 54.984229 55.311953
#> 80% 64.073167 64.062629
#> 90% 71.565441 71.567274

example 1b: calibrate only quantiles (deciles)

result2 <- joint_calib(formula_totals = ~x,
                       formula_quantiles = ~x,
                       data = df_resp,
                       dweights = df_resp$d,
                       N = N,
                       pop_quantiles = quants_known,
                       pop_totals = totals_known,
                       method = "linear",
                       backend = "sampling")
summary(result2$g)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  0.3563  0.6208  0.8199  1.0000  1.4368  2.5384
data.frame(true = quants_known$x, est = weightedQuantile(df_resp$x, result2$g*df_resp$d, probs))
#>          true       est
#> 10%  7.078067  7.085003
#> 20% 14.831221 14.824424
#> 30% 23.146180 23.287657
#> 40% 31.641911 31.802986
#> 50% 39.033812 39.154276
#> 60% 47.527168 48.252065
#> 70% 54.984229 55.311953
#> 80% 64.073167 64.062629
#> 90% 71.565441 71.567274

We can restrict weights to specific range using logit.

result3 <- joint_calib(formula_totals = ~x,
                       formula_quantiles = ~x,
                       data = df_resp,
                       dweights = df_resp$d,
                       N = N,
                       pop_quantiles = quants_known,
                       pop_totals = totals_known,
                       method = "logit",
                       backend = "sampling", 
                       maxit = 500,
                       bounds = c(0, 3))

summary(result3$g)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  0.3911  0.6254  0.8140  1.0000  1.4325  2.5186
data.frame(true = quants_known$x, est = weightedQuantile(df_resp$x, result3$g*df_resp$d, probs))
#>          true       est
#> 10%  7.078067  7.085003
#> 20% 14.831221 14.824424
#> 30% 23.146180 22.872078
#> 40% 31.641911 31.297922
#> 50% 39.033812 39.154276
#> 60% 47.527168 48.252065
#> 70% 54.984229 55.311953
#> 80% 64.073167 64.529683
#> 90% 71.565441 71.567274

Empirical likelihood method can be applied using the following code

result4a <- joint_calib(formula_quantiles = ~ x,
                        data = df_resp,
                        dweights = df_resp$d,
                        N = N,
                        pop_quantiles = quants_known,
                        method = "el",
                        backend = "base")
summary(result4a$g)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  0.4563  0.6773  0.8180  1.0000  1.4500  2.4538

result4b <- joint_calib(formula_totals = ~ x,
                        formula_quantiles = ~ x,
                        data = df_resp,
                        dweights = df_resp$d,
                        N = N,
                        pop_quantiles = quants_known,
                        pop_totals = totals_known,
                        method = "el",
                        backend = "base")

summary(result4b$g)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  0.4414  0.6584  0.8109  1.0000  1.4256  2.8513

result4c <- calib_el(X = result4b$Xs[, c(1, 11)],
                     d = df_resp$d, 
                     totals = c(N,totals_known),
                     maxit = 50,
                     tol = 1e-8)

summary(result4c)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  0.7438  0.7910  0.8848  1.0000  1.2455  1.4923
data.frame(true = quants_known$x, 
           est = weightedQuantile(df_resp$x, result2$g*df_resp$d, probs),
           est_el0 = weightedQuantile(df_resp$x, result4c*df_resp$d, probs),
           est_el1 = weightedQuantile(df_resp$x, result4a$g*df_resp$d, probs),
           est_el2 = weightedQuantile(df_resp$x, result4b$g*df_resp$d, probs))
#>          true       est   est_el0   est_el1   est_el2
#> 10%  7.078067  7.085003  3.640246  7.085003  7.085003
#> 20% 14.831221 14.824424  8.024948 14.824424 14.824424
#> 30% 23.146180 23.287657 14.499018 23.287657 23.287657
#> 40% 31.641911 31.802986 24.010501 31.802986 31.802986
#> 50% 39.033812 39.154276 39.154276 38.892342 39.154276
#> 60% 47.527168 48.252065 54.685438 48.252065 48.252065
#> 70% 54.984229 55.311953 63.084405 54.954054 54.954054
#> 80% 64.073167 64.062629 70.050593 64.062629 64.062629
#> 90% 71.565441 71.567274 75.663078 71.567274 71.567274

Entropy balancing method can be applied using the following code

result5 <- joint_calib(formula_quantiles = ~ x,
                        data = df_resp,
                        dweights = df_resp$d,
                        N = N,
                        pop_quantiles = quants_known,
                        method = "eb")
summary(result5$g)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  0.4576  0.6775  0.8180  1.0003  1.4500  2.4538

Finally, compare all method with true Y distribution where

results_y <- data.frame(true_y = y_quant_true,
                        est_y1 = weightedQuantile(df_resp$y, result1$g * df_resp$d, probs),
                        est_y2 = weightedQuantile(df_resp$y, result2$g * df_resp$d, probs),
                        est_y3 = weightedQuantile(df_resp$y, result3$g * df_resp$d, probs),
                        est_y4 = weightedQuantile(df_resp$y, result4c * df_resp$d, probs),
                        est_y5 = weightedQuantile(df_resp$y, result4a$g * df_resp$d, probs),
                        est_y6 = weightedQuantile(df_resp$y, result4b$g * df_resp$d, probs),
                        est_y7 = weightedQuantile(df_resp$y, result5$g * df_resp$d, probs))

results_y
#>         true_y     est_y1     est_y2     est_y3     est_y4     est_y5
#> 10%  -56.97982  -36.18473  -36.18473  -36.18473 -149.00996  -36.18473
#> 20%  210.58301  228.75429  228.75429  228.75429   17.27107  228.75429
#> 30%  444.81652  413.22517  417.60155  413.22517  203.58302  413.22517
#> 40%  646.06099  622.82192  622.82192  622.82192  381.99560  622.82192
#> 50%  803.50842  789.89936  789.89936  789.89936  503.35849  789.89936
#> 60%  975.31803  925.84730  925.84730  925.84730  679.04530  925.84730
#> 70% 1123.44643 1023.75230 1023.75230 1023.75230  811.84389 1023.75230
#> 80% 1245.62645 1162.06504 1162.06504 1162.06504 1009.46703 1162.06504
#> 90% 1449.23819 1471.33301 1471.33301 1471.33301 1242.60357 1471.33301
#>         est_y6     est_y7
#> 10%  -36.18473  -36.18473
#> 20%  228.75429  228.75429
#> 30%  411.47088  413.22517
#> 40%  622.82192  622.82192
#> 50%  789.89936  789.89936
#> 60%  925.84730  925.84730
#> 70% 1023.75230 1023.75230
#> 80% 1162.06504 1162.06504
#> 90% 1471.33301 1471.33301

Here is the sum of squares and calibration of totals and means seems to be the best.

apply(results_y[, -1], 2, FUN = function(x) sum((x-results_y[,1])^2))
#>    est_y1    est_y2    est_y3    est_y4    est_y5    est_y6    est_y7 
#>  22342.87  22085.51  22342.87 547195.99  22342.87  22456.79  22342.87

Using survey package

example 1a: calibrate only quantiles (deciles)

A <- joint_calib_create_matrix(X_q = model.matrix(~x+0, df_resp), N = N, pop_quantiles = quants_known)
colnames(A) <- paste0("quant_", gsub("\\D", "", names(quants_known$x)))
A <- as.data.frame(A)
df_resp <- cbind(df_resp, A)
head(df_resp)
#>           x        y p        d quant_10 quant_20 quant_30 quant_40 quant_50
#> 6  12.35687 444.9053 1 3.134796        0    0.001    0.001    0.001    0.001
#> 7  61.90319 403.9473 1 3.134796        0    0.000    0.000    0.000    0.000
#> 13 60.96079 923.4975 1 3.134796        0    0.000    0.000    0.000    0.000
#> 14 76.85300 124.4110 1 3.134796        0    0.000    0.000    0.000    0.000
#> 18 71.52828 422.0934 1 3.134796        0    0.000    0.000    0.000    0.000
#> 19 65.32808 740.4801 1 3.134796        0    0.000    0.000    0.000    0.000
#>    quant_60 quant_70 quant_80 quant_90
#> 6     0.001    0.001    0.001    0.001
#> 7     0.000    0.000    0.001    0.001
#> 13    0.000    0.000    0.001    0.001
#> 14    0.000    0.000    0.000    0.000
#> 18    0.000    0.000    0.000    0.001
#> 19    0.000    0.000    0.000    0.001
m1 <- svydesign(ids = ~1, data = df_resp, weights = ~d)
quants_formula <- as.formula(paste("~", paste(colnames(A), collapse = "+")))
quants_formula
#> ~quant_10 + quant_20 + quant_30 + quant_40 + quant_50 + quant_60 + 
#>     quant_70 + quant_80 + quant_90
svytotal(quants_formula, m1)
#>            total     SE
#> quant_10 0.10972 0.0175
#> quant_20 0.23197 0.0237
#> quant_30 0.29154 0.0255
#> quant_40 0.34796 0.0267
#> quant_50 0.38871 0.0273
#> quant_60 0.43887 0.0278
#> quant_70 0.50784 0.0280
#> quant_80 0.63323 0.0270
#> quant_90 0.78100 0.0232

Calibration using calibrate

pop_totals <- c(N, probs)
names(pop_totals) <- c("(Intercept)", colnames(A))
m1_cal <- calibrate(m1, quants_formula, pop_totals)
svytotal(quants_formula, m1_cal)
#>          total SE
#> quant_10   0.1  0
#> quant_20   0.2  0
#> quant_30   0.3  0
#> quant_40   0.4  0
#> quant_50   0.5  0
#> quant_60   0.6  0
#> quant_70   0.7  0
#> quant_80   0.8  0
#> quant_90   0.9  0

Calibration using grake (low level)

g1 <- grake(mm = as.matrix(cbind(1, A)),
            ww = df_resp$d,
            calfun = cal.linear,
            population = pop_totals,
            bounds = list(lower = -Inf, upper = Inf),
            epsilon = 1e-7,
            verbose = FALSE,
            maxit = 50,
            variance = NULL)
summary(as.numeric(g1))
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>  0.4562  0.6773  0.8180  1.0000  1.4500  2.4538

Example 2 – non-probability samples

Example 3 – observational studies / causal inference

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.