library(tidyverse)
library(DasGuptR)
Bootstrapping standardized rates and decomposition effects

The basic premise of bootstrapping Das Gupta’s rate decomposition can be thought of as expanding out to unit-level data, re-sampling, then aggregating back up.

Data from Wang 2000:

eg.wang2000 <- data.frame(
  pop = rep(c("male", "female"), e = 12),
  ethnicity = rep(1:3, e = 4),
  age_group = rep(1:4, 3),
  size = c(
    130, 1305, 1539, 316, 211, 697, 334, 48, 105, 475, 424, 49,
    70, 604, 428, 43, 55, 127, 44, 9, 72, 178, 103, 12
  ),
  rate = c(
    12.31, 34.90, 52.91, 44.44, 16.67, 36.40, 51.20, 41.67, 12.38, 19.20, 21.23, 12.50,
    17.14, 35.55, 48.71, 55.81, 14.55, 39.37, 32.56, 55.56, 22.22, 20.34, 13.59, 8.33
  )
)

To do this with the DasGuptR package, we must first create a function that essentially “uncounts” the data into individual-level data (where nrow = the size of the total population), re-samples with replacement, and then aggregates back up to the level of the original data.
getBS <- function() {
  # expand out: one row per individual, flagged r = "r1" (event) or r = "r0" (no event)
  exp <- eg.wang2000 |>
    mutate(
      r1 = (rate / 100) * size,
      r0 = (1 - (rate / 100)) * size
    ) |>
    select(pop, ethnicity, age_group, r0:r1) |>
    pivot_longer(r0:r1, names_to = "r", values_to = "n") |>
    mutate(n = as.integer(n)) |>
    uncount(n)

  # sample for each pop: resample individuals with replacement,
  # then aggregate back up to one row per ethnicity x age_group
  bs <- lapply(unique(eg.wang2000$pop), \(p)
    slice_sample(exp[exp$pop == p, ],
      prop = 1, replace = TRUE
    ) |>
      group_by(ethnicity, age_group) |>
      reframe(
        pop = p,
        size = n(),
        rate = mean(r == "r1") * 100
      ) |>
      ungroup())

  do.call(rbind, bs)
}
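As a quick check (a sketch; output not shown here), a single call to getBS() should return one resampled dataset with the same columns as eg.wang2000 (pop, ethnicity, age_group, size, rate; not necessarily in the same order), which is the structure passed to dgnpop() below:

# inspect one bootstrap replicate
getBS() |> head()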
Take 500 resamples:
bsamps <- tibble(
  k = 1:500,
  i = map(1:500, ~ getBS())
)
and apply the standardisation to each resample:
bsamps <- bsamps |>
  mutate(
    dgo = map(i, ~ dgnpop(.,
      pop = "pop", factors = "rate",
      id_vars = c("ethnicity", "age_group"),
      crossclassified = "size"
    ))
  )
Here are the original difference effects:
dgnpop(eg.wang2000,
  pop = "pop", factors = "rate",
  id_vars = c("ethnicity", "age_group"),
  crossclassified = "size"
) |>
  dg_table()
                   female     male        diff decomp
age_group_struct 34.80709 36.73462  1.92753072  68.93
ethnicity_struct 35.79043 35.75128 -0.03914904  -1.40
rate             35.35659 36.26448  0.90788961  32.47
crude            34.59755 37.39382  2.79627129 100.00
And here are the standard errors:
bsamps |>
  select(k, dgo) |>
  unnest(dgo) |>
  group_by(k, factor) |>
  reframe(diff = rate[pop == "female"] - rate[pop == "male"]) |>
  group_by(factor) |>
  reframe(se = sd(diff))
# A tibble: 4 × 2
factor se
<chr> <dbl>
1 age_group_struct 0.313
2 crude 1.36
3 ethnicity_struct 0.338
4 rate 1.34
Which we can take as a proportion of the crude rate difference:
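The exact summary is a matter of preference; as one sketch, each factor’s bootstrapped difference can be re-expressed as a percentage of that resample’s crude difference (mirroring the decomp column above) before taking the standard deviation across resamples:

# sketch: bootstrap SEs for each factor's share (%) of the crude difference
bsamps |>
  select(k, dgo) |>
  unnest(dgo) |>
  group_by(k, factor) |>
  reframe(diff = rate[pop == "female"] - rate[pop == "male"]) |>
  group_by(k) |>
  mutate(decomp = 100 * diff / diff[factor == "crude"]) |>
  ungroup() |>
  group_by(factor) |>
  reframe(se = sd(decomp))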
Uncertainty in rates
We can use this same approach to estimate uncertainty in adjusted rates. For instance, using the Scottish Reconvictions data:
data(reconv)

src <- reconv |>
  transmute(
    pop = year, sex = Sex, age = Age,
    rate = prev_rate,
    size = offenders,
    r1 = rate * size,
    r0 = (1 - rate) * size
  )
getBS <- function() {
  # expand out
  exp <- src |>
    select(pop, sex, age, r0, r1) |>
    pivot_longer(r0:r1, names_to = "r", values_to = "n") |>
    mutate(n = as.integer(n)) |>
    uncount(n)

  # sample for each pop
  bs <- lapply(unique(src$pop), \(p)
    slice_sample(exp[exp$pop == p, ],
      prop = 1, replace = TRUE
    ) |>
      group_by(sex, age) |>
      reframe(
        pop = p,
        size = n(),
        rate = mean(r == "r1")
      ) |>
      ungroup())

  do.call(rbind, bs)
}
# 500 resamples
bsamps <- tibble(
  k = 1:500,
  i = map(1:500, ~ getBS())
)

# apply DG on each resample
bsamps <- bsamps |>
  mutate(
    dgo = map(i, ~ dgnpop(.,
      pop = "pop", factors = "rate",
      id_vars = c("age", "sex"),
      crossclassified = "size"
    ))
  )
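From here, one way to summarise uncertainty in the adjusted rates (a sketch, assuming the “rate” factor returned by dgnpop() is the quantity of interest, i.e. the rate adjusted for age and sex structure) is to take percentile intervals across resamples for each year:

# sketch: 95% percentile intervals for the age/sex-adjusted rate in each year
bsamps |>
  select(k, dgo) |>
  unnest(dgo) |>
  filter(factor == "rate") |>
  group_by(pop) |>
  reframe(
    est = mean(rate),
    lwr = quantile(rate, .025),
    upr = quantile(rate, .975)
  )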