library(DasGuptR)
library(tidyverse)
library(gt)
Category Effects in Das Gupta’s decomposition
For cross-classified data, Chevan & Sutherland 2009 provide a method for decomposing differences in crude rates into differences in each specific category of the sub-population variables in terms of change in what proportion of the population they make up, and changes in sub-population rates.
Below we demonstrate this using eg5.3 from Das Gupta 1993, and then Scottish Reconvictions data.
Example 5.3, Das Gupta 1993
.3 <- data.frame(
eg5race = rep(rep(1:2, e = 11), 2),
age = rep(rep(1:11, 2), 2),
pop = rep(c(1985, 1970), e = 22),
size = c(
3041, 11577, 27450, 32711, 35480, 27411, 19555, 19795, 15254, 8022, 2472,
707, 2692, 6473, 6841, 6547, 4352, 3034, 2540, 1749, 804, 236,
2968, 11484, 34614, 30992, 21983, 20314, 20928, 16897, 11339, 5720, 1315,
535, 2162, 6120, 4781, 3096, 2718, 2363, 1767, 1149, 448, 117
),rate = c(
9.163, 0.462, 0.248, 0.929, 1.084, 1.810, 4.715, 12.187, 27.728, 64.068, 157.570,
17.208, 0.738, 0.328, 1.103, 2.045, 3.724, 8.052, 17.812, 34.128, 68.276, 125.161,
18.469, 0.751, 0.391, 1.146, 1.287, 2.672, 6.636, 15.691, 34.723, 79.763, 176.837,
36.993, 1.352, 0.541, 2.040, 3.523, 6.746, 12.967, 24.471, 45.091, 74.902, 123.205
) )
Here is the population-level decomposition:
dgnpop(eg5.3,
pop = "pop", factors = "rate",
id_vars = c("race", "age"), crossclassified = "size"
|>
) dg_table()
1970 1985 diff decomp
age_struct 8.385332 9.907067 1.5217347 -221.76
race_struct 9.136312 9.156087 0.0197749 -2.88
rate 10.257368 8.029643 -2.2277254 324.64
crude 9.421833 8.735618 -0.6862158 100.00
In order to calculate the influence of changes in specific categories, we need to first get out the sub-population level standardised rates from the decomposition. When we specify crossclassified
argument in dgnpop()
, this will aggregate up to population level unless we also specify agg = FALSE
.
As we are working with vector-factors, this will return rates at the sub-population levels:
<- dgnpop(eg5.3,
res pop = "pop", factors = "rate",
id_vars = c("race", "age"), crossclassified = "size", agg = FALSE
)head(res)
rate pop std.set factor race age
1 0.12507554 1985 1970 rate 1 1
2 0.02421759 1985 1970 rate 1 2
3 0.03531667 1985 1970 rate 1 3
4 0.13427610 1985 1970 rate 1 4
5 0.13900780 1985 1970 rate 1 5
6 0.19410902 1985 1970 rate 1 6
We can take those individual rates, and then calculate the category effects for each group of the sub-population variables — e.g., for each \(i\) of race and \(j\) of age.
In the scenario that we have two cross-classified variables \(I\) and \(J\) and the relevant sub-population rates \(T\), dgnpop()
will return three standardised rates: \(K-I\) standardised, \(K-J\) standardised, and \(K-T\) standardised.
We can see these for each sub-population (reshaped to wide):
|>
res pivot_wider(names_from = factor, values_from = rate)
# A tibble: 44 × 7
pop std.set race age rate race_struct age_struct
<chr> <chr> <int> <int> <dbl> <dbl> <dbl>
1 1985 1970 1 1 0.125 0.185 0.179
2 1985 1970 1 2 0.0242 0.0312 0.0299
3 1985 1970 1 3 0.0353 0.0444 0.0375
4 1985 1970 1 4 0.134 0.147 0.145
5 1985 1970 1 5 0.139 0.150 0.179
6 1985 1970 1 6 0.194 0.237 0.261
7 1985 1970 1 7 0.435 0.514 0.473
8 1985 1970 1 8 1.01 1.14 1.17
9 1985 1970 1 9 1.66 1.85 2.02
10 1985 1970 1 10 1.98 2.19 2.45
# ℹ 34 more rows
To calculate category effects, we aggregate these sub-population rates up to the categories in for each cross-classified variable independently for each population (see Chevan & Sutherland 2009), and the differences between populations provide the category effects.
Race categories:
<- res |>
race_ce pivot_wider(names_from = factor, values_from = rate) |>
group_by(pop, race) |>
summarise(
# sum of P-I std rates
IAi = sum(race_struct),
# sum of P-T std rates / number of crossclass vars
RTi = sum(rate) * (1 / 2),
CEi = IAi + RTi
|>
) group_by(race) |>
summarise(
# category composition effect
comp_ce = IAi[pop == 1985] - IAi[pop == 1970],
# category rate effect
rate_ce = RTi[pop == 1985] - RTi[pop == 1970],
# category total effect
tot_ce = CEi[pop == 1985] - CEi[pop == 1970]
)
Age categories:
<- res |>
age_ce pivot_wider(names_from = factor, values_from = rate) |>
group_by(pop, age) |>
summarise(
# sum of P-J std rates
JBj = sum(age_struct),
# sum of P-T std rates / number of crossclass vars
RTj = sum(rate) * (1 / 2),
CEj = JBj + RTj
|>
) group_by(age) |>
summarise(
# category composition effect
comp_ce = JBj[pop == 1985] - JBj[pop == 1970],
# category rate effect
rate_ce = RTj[pop == 1985] - RTj[pop == 1970],
# category total effect
tot_ce = CEj[pop == 1985] - CEj[pop == 1970]
)
The category-total effects will sum back up to the difference in the crude rates (see top of page)
sum(c(race_ce$tot_ce, age_ce$tot_ce))
[1] -0.6862158
And the category-rate effects will sum back up to the differences in rate-adjusted rates:
sum(c(race_ce$rate_ce, age_ce$rate_ce))
[1] -2.227725
We can simply take these as a percentage of the crude rate differences.
- The category-composition effect (comp_ce below) represents the percentage of the crude rate difference that is due to change in \(\frac{\text{category size}}{\text{total population size}}\).
- The category-rate effect (rate_ce below) represents the percentage of the crude rate difference that is due to change in category-specific rates.
- The category-total effect (tot_ce below) represents the percentage of the crude rate difference that is due to change in both category relative size and category specific rates.
bind_rows(
|> mutate(var = "race") |> rename(category = race),
race_ce |> mutate(var = "age") |> rename(category = age)
age_ce |>
) mutate(
comp_ce = comp_ce / sum(tot_ce) * 100,
rate_ce = rate_ce / sum(tot_ce) * 100,
tot_ce = tot_ce / sum(tot_ce) * 100
|>
) group_by(var) |>
mutate(across(2:4, ~ round(., 2))) |>
gt()
category | comp_ce | rate_ce | tot_ce |
---|---|---|---|
race | |||
1 | 28.91 | 134.37 | 163.27 |
2 | -31.79 | 27.95 | -3.83 |
age | |||
1 | 3.57 | 13.28 | 16.85 |
2 | 0.75 | 1.59 | 2.34 |
3 | 2.87 | 1.93 | 4.80 |
4 | 1.59 | 4.06 | 5.65 |
5 | -10.96 | 4.19 | -6.77 |
6 | -7.54 | 10.21 | 2.67 |
7 | 17.32 | 17.27 | 34.59 |
8 | -4.87 | 25.85 | 20.98 |
9 | -47.47 | 35.64 | -11.83 |
10 | -72.45 | 36.60 | -35.85 |
11 | -104.56 | 11.69 | -92.88 |
Note that these will map to how each category changes between the two populations with respect to a) the proportion of the population that the category constitutes, and b) the average rate. If a category makes up a larger proportion of the population, its category-composition effect will be positive (if the crude rate difference is positive). If the category specific rate changes in the same direction as the crude rate changes, the category-rate effect is positive. The benefit of Chevan & Sutherland’s approach is that these are now scaled proportional to the total crude rate difference.
Changes in category proportions:
.3 |>
eg5group_by(pop, age) |>
summarise(size = sum(size)) |>
group_by(pop) |>
mutate(prop = size / sum(size)) |>
group_by(age) |>
summarise(delta = prop[pop == 1985] - prop[pop == 1970])
# A tibble: 11 × 2
age delta
<int> <dbl>
1 1 -0.00149
2 2 -0.00719
3 3 -0.0578
4 4 -0.00985
5 5 0.0530
6 6 0.0200
7 7 -0.0197
8 8 0.00198
9 9 0.00995
10 10 0.00671
11 11 0.00432
Scottish Reconvictions Data
Data:
data(reconv)
<- reconv |>
srec filter(year %in% c(2004, 2016)) |>
select(year, Sex, Age, offenders, prev_rate)
Population level decomposition:
dgnpop(srec,
pop = "year", factors = c("prev_rate"),
id_vars = c("Age", "Sex"),
crossclassified = "offenders", agg = TRUE
|>
) dg_table()
2004 2016 diff decomp
Age_struct 0.3052555 0.2837824 -0.0214730910 41.31
prev_rate 0.3094771 0.2797243 -0.0297527631 57.23
Sex_struct 0.2948982 0.2941397 -0.0007584511 1.46
crude 0.3237422 0.2717579 -0.0519843051 100.00
Sub-population standardised rates:
<- dgnpop(srec,
res pop = "year", factors = c("prev_rate"),
id_vars = c("Age", "Sex"),
crossclassified = "offenders", agg = FALSE
)
Category effects for Sex and Age:
<- res |>
sex_ce pivot_wider(names_from = factor, values_from = rate) |>
group_by(pop, Sex) |>
summarise(
IAi = sum(Sex_struct),
RTi = sum(prev_rate) * (1 / 2),
CEi = IAi + RTi
|>
) group_by(Sex) |>
summarise(
comp_ce = diff(IAi),
rate_ce = diff(RTi),
tot_ce = diff(CEi)
)
<- res |>
age_ce pivot_wider(names_from = factor, values_from = rate) |>
group_by(pop, Age) |>
summarise(
JBj = sum(Age_struct),
RTj = sum(prev_rate) * (1 / 2),
CEj = JBj + RTj
|>
) group_by(Age) |>
summarise(
comp_ce = diff(JBj),
rate_ce = diff(RTj),
tot_ce = diff(CEj)
)
bind_rows(
|> mutate(var = "sex") |> rename(category = Sex),
sex_ce |> mutate(var = "age") |> rename(category = Age)
age_ce |>
) mutate(
comp_ce = comp_ce / sum(tot_ce) * 100,
rate_ce = rate_ce / sum(tot_ce) * 100,
tot_ce = tot_ce / sum(tot_ce) * 100
|>
) group_by(var) |>
mutate(across(2:4, ~ round(., 2))) |>
gt()
category | comp_ce | rate_ce | tot_ce |
---|---|---|---|
sex | |||
Female | -6.70 | 5.25 | -1.45 |
Male | 8.16 | 23.36 | 31.52 |
age | |||
21 to 25 | 28.00 | 12.73 | 40.73 |
26 to 30 | -11.12 | 8.16 | -2.97 |
31 to 40 | -17.74 | -2.14 | -19.88 |
over 40 | -46.57 | -1.58 | -48.15 |
under 21 | 88.73 | 11.45 | 100.18 |