Utilities for Working with Age Categories

The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

ageutils provides a collection of functions for working with age intervals whose underlying implementations have been optimised for performance.

`breaks_to_interval()`

breaks_to_interval() provides a categorisation based on specified breaks which represent left-hand interval limits. The resultant groupings span from the minimum break through to a specified max_upper and will always be closed on the left and open on the right. As an example, if breaks = c(0, 1, 10, 30) the interval categories would be [0, 1), [1, 10), [10, 30) and [30, Inf). Because the final interval is open on the right, ages at or above max_upper will be returned as NA.

The returned value is a data frame with 3 columns: a factor with a character representation of the interval, and two numeric columns giving the corresponding lower (closed) and upper (open) bounds.

library(ageutils)

breaks_to_interval(breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L))
#>    interval lower_bound upper_bound
#> 1    [0, 1)           0           1
#> 2    [1, 5)           1           5
#> 3   [5, 15)           5          15
#> 4  [15, 25)          15          25
#> 5  [25, 45)          25          45
#> 6  [45, 65)          45          65
#> 7 [65, Inf)          65         Inf


breaks_to_interval(breaks = c(1L, 5L, 15L), max_upper = 25L)
#>   interval lower_bound upper_bound
#> 1   [1, 5)           1           5
#> 2  [5, 15)           5          15
#> 3 [15, 25)          15          25

`cut_ages()`

cut_ages() provides categorisation of ages based on specified breaks which represent the left-hand interval limits. Categorisation is based on the breaks and follows the approach of breaks_to_interval.

cut_ages(ages = 0:9, breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L))
#>    interval lower_bound upper_bound
#> 1    [0, 1)           0           1
#> 2    [1, 5)           1           5
#> 3    [1, 5)           1           5
#> 4    [1, 5)           1           5
#> 5    [1, 5)           1           5
#> 6   [5, 15)           5          15
#> 7   [5, 15)           5          15
#> 8   [5, 15)           5          15
#> 9   [5, 15)           5          15
#> 10  [5, 15)           5          15


cut_ages(1:10, breaks = c(0L, 4L), max_upper = 9L)
#>    interval lower_bound upper_bound
#> 1    [0, 4)           0           4
#> 2    [0, 4)           0           4
#> 3    [0, 4)           0           4
#> 4    [4, 9)           4           9
#> 5    [4, 9)           4           9
#> 6    [4, 9)           4           9
#> 7    [4, 9)           4           9
#> 8    [4, 9)           4           9
#> 9      <NA>          NA          NA
#> 10     <NA>          NA          NA


x <- cut_ages(1:100, breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L))

str(x)
#> 'data.frame':	100 obs. of  3 variables:
#>  $ interval   : Ord.factor w/ 7 levels "[0, 1)"<"[1, 5)"<..: 2 2 2 2 3 3 3 3 3 3 ...
#>  $ lower_bound: num  1 1 1 1 5 5 5 5 5 5 ...
#>  $ upper_bound: num  5 5 5 5 15 15 15 15 15 15 ...


head(x$interval)
#> [1] [1, 5)  [1, 5)  [1, 5)  [1, 5)  [5, 15) [5, 15)
#> 7 Levels: [0, 1) < [1, 5) < [5, 15) < [15, 25) < [25, 45) < ... < [65, Inf)

`split_interval_counts()`

split_interval_counts() splits counts within an age interval into counts for individual years based on a given weighting. Age intervals are specified by their lower (closed) and upper (open) bounds, i.e. intervals of the form [lower, upper).

# by default counts are split equally across ages within intervals
split_interval_counts(
    lower_bounds = c(0L, 5L, 10L),
    upper_bounds = c(5L, 10L, 20L),
    counts = c(5L, 10L, 30L)
)
#>    age count
#> 1    0     1
#> 2    1     1
#> 3    2     1
#> 4    3     1
#> 5    4     1
#> 6    5     2
#> 7    6     2
#> 8    7     2
#> 9    8     2
#> 10   9     2
#> 11  10     3
#> 12  11     3
#> 13  12     3
#> 14  13     3
#> 15  14     3
#> 16  15     3
#> 17  16     3
#> 18  17     3
#> 19  18     3
#> 20  19     3


# Population weightings to apply for individual years can be specified by
# the weights argument. If these are specified, they must be of length
# `max_upper` and represent weights in the range 0:(max_upper - 1).
max_upper <- 20L
weights <- integer(max_upper)
weights[c(TRUE, FALSE)] <- 1L
split_interval_counts(
    lower_bounds = c(0L, 5L, 10L),
    upper_bounds = c(5L, 10L, 20L),
    counts = c(5L, 10L, 30L),
    max_upper = max_upper,
    weights = weights
)
#>    age    count
#> 1    0 1.666667
#> 2    1 0.000000
#> 3    2 1.666667
#> 4    3 0.000000
#> 5    4 1.666667
#> 6    5 0.000000
#> 7    6 5.000000
#> 8    7 0.000000
#> 9    8 5.000000
#> 10   9 0.000000
#> 11  10 6.000000
#> 12  11 0.000000
#> 13  12 6.000000
#> 14  13 0.000000
#> 15  14 6.000000
#> 16  15 0.000000
#> 17  16 6.000000
#> 18  17 0.000000
#> 19  18 6.000000
#> 20  19 0.000000

`aggregate_age_counts()`

aggregate_age_counts() provides aggregation of counts across ages (in years). It is similar to a cut() and tapply() pattern but optimised for speed over flexibility. Groupings are the same as in cut_ages() and counts will be provided across all natural numbers as well as for missing values.

# default ages generated as 0:(length(counts) - 1L) if only counts provided.
aggregate_age_counts(counts = 1:65, breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L))
#>    interval lower_bound upper_bound count
#> 1    [0, 1)           0           1     1
#> 2    [1, 5)           1           5    14
#> 3   [5, 15)           5          15   105
#> 4  [15, 25)          15          25   205
#> 5  [25, 45)          25          45   710
#> 6  [45, 65)          45          65  1110
#> 7 [65, Inf)          65         Inf     0


# NA ages are also handled with their own grouping
ages <- 1:65
ages[1:44] <- NA
aggregate_age_counts(
    counts = 1:65,
    ages = ages,
    breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L)
)
#>    interval lower_bound upper_bound count
#> 1    [0, 1)           0           1     0
#> 2    [1, 5)           1           5     0
#> 3   [5, 15)           5          15     0
#> 4  [15, 25)          15          25     0
#> 5  [25, 45)          25          45     0
#> 6  [45, 65)          45          65  1090
#> 7 [65, Inf)          65         Inf    65
#> 8      <NA>          NA          NA   990

`reaggregate_interval_counts()`

reaggregate_interval_counts() is equivalent to, but more efficient than, a call to split_interval_counts() followed by aggregate_age_counts().

The example below shows how it can be used to redistribute counts across a desired set of age intervals. We use data included in the package that has been obtained from the 2021 census and modify this based on our desired interval limits.

# census data
data(pop_dat)
pop_dat
#>    area_code         area_name age_category   value
#> 1  K04000001 England and Wales       [0, 5) 3232100
#> 2  K04000001 England and Wales      [5, 10) 3524600
#> 3  K04000001 England and Wales     [10, 15) 3595900
#> 4  K04000001 England and Wales     [15, 20) 3394700
#> 5  K04000001 England and Wales     [20, 25) 3602100
#> 6  K04000001 England and Wales     [25, 30) 3901800
#> 7  K04000001 England and Wales     [30, 35) 4148800
#> 8  K04000001 England and Wales     [35, 40) 3981600
#> 9  K04000001 England and Wales     [40, 45) 3755700
#> 10 K04000001 England and Wales     [45, 50) 3788700
#> 11 K04000001 England and Wales     [50, 55) 4123400
#> 12 K04000001 England and Wales     [55, 60) 4029000
#> 13 K04000001 England and Wales     [60, 65) 3455700
#> 14 K04000001 England and Wales     [65, 70) 2945100
#> 15 K04000001 England and Wales     [70, 75) 2978000
#> 16 K04000001 England and Wales     [75, 80) 2170300
#> 17 K04000001 England and Wales     [80, 85) 1517000
#> 18 K04000001 England and Wales     [85, 90)  925100
#> 19 K04000001 England and Wales    [90, Inf)  527900


# each row is for the same region so discard the area columns for the moment
dat <- subset(pop_dat, select = c(age_category, value))

# extract upper and lower bounds
dat <- transform(
    dat,
    lower_bound = as.numeric(sub("\\[([0-9]+), .+)", "\\1", age_category)),
    upper_bound = as.numeric(sub(".+, (.+))", "\\1", age_category))
)

head(dat, n=10)
#>    age_category   value lower_bound upper_bound
#> 1        [0, 5) 3232100           0           5
#> 2       [5, 10) 3524600           5          10
#> 3      [10, 15) 3595900          10          15
#> 4      [15, 20) 3394700          15          20
#> 5      [20, 25) 3602100          20          25
#> 6      [25, 30) 3901800          25          30
#> 7      [30, 35) 4148800          30          35
#> 8      [35, 40) 3981600          35          40
#> 9      [40, 45) 3755700          40          45
#> 10     [45, 50) 3788700          45          50


# recategorise based on ages
with(
    dat,
    reaggregate_interval_counts(
        lower_bounds = lower_bound,
        upper_bounds = upper_bound,
        counts = value,
        breaks = c(0L, 1L, 5L, 15L, 25L, 45L, 65L),
        max_upper = 100L,
        weights = NULL
    )
)
#> Warning in reaggregate_interval_counts(lower_bounds = lower_bound, upper_bounds
#> = upper_bound, : `upper_bounds` greater than `max_upper` (100) have been
#> replaced prior to splitting.
#>    interval lower_bound upper_bound    count
#> 1    [0, 1)           0           1   646420
#> 2    [1, 5)           1           5  2585680
#> 3   [5, 15)           5          15  7120500
#> 4  [15, 25)          15          25  6996800
#> 5  [25, 45)          25          45 15787900
#> 6  [45, 65)          45          65 15396800
#> 7 [65, Inf)          65         Inf 11063400

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.