The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.
The epidm R package provides utilities to support patient identity resolution and clinical episode construction.
This vignette introduces two key functions:
uk_patient_id()
Identifying patients and generating unique patient ids using known
patient identifiers with multiple staged rules.
group_time()
Groups time intervals into clinically meaningful episodes (static or
rolling windows).
Both functions are designed for healthcare data, for example SGSS, HES/SUS data and ECDS data where repeated records across systems must be linked or aggregated.
uk_patient_id() functionA dataset should ideally contain:
You must also supply a list mapping your column
names using the id = list(...) argument.
# 1) Create example data
id_test <-
data.frame(
stringsAsFactors = FALSE,
record_id = c(1L,2L,3L,4L,
5L,6L,7L,8L,9L,10L,11L,12L,13L,14L,15L,
16L,17L,18L,19L,20L,21L,22L,23L,24L),
nhs_number = c(9435754422,
9435754422,NA,9435754422,5555555555,NA,
9435773982,NA,9999999999,NA,9435773982,NA,
9435802508,9435802508,NA,NA,9435802508,9435802508,NA,
3333333333,NA,9999999999,9435817777,
9435817777),
local_patient_identifier = c(NA,"IG12067",
NA,NA,"IG12067","IG12067","KR2535","KR2535",
"KR2535",NA,NA,NA,"UK8734","UK8734",NA,NA,
"UK8734","UK8734",NA,NA,"JH45204",
"HS45202","HS45202","JH45204"),
patient_birth_date = c("1993-07-16",
"1993-07-16","1993-07-16","1993-07-16",
"1993-07-16",NA,"1967-02-10",NA,"1967-02-10",NA,NA,
"1967-02-10",NA,NA,"1952-10-22","1952-10-22",
"1952-10-22",NA,"1947-09-14","1947-09-14",
"1947-09-14","1947-09-14","1947-09-14",
"1947-09-14"),
sex = c("Male","Male",
"Male","Male",NA,"Male","Female","Female",
"Female","Female","Female","Female","Male",
"Male","Male","Male","Male","Male","Male",
"Male","Male","Male",NA,"Male"),
forename = c(NA,"DENNIS",
NA,NA,"DENNIS",NA,"ELLIE","ELLIE",NA,
"ELLIE","ELLIE","ELLIE","IAN","IAN","MALCOLM",
"IAN","IAN",NA,"GRANT","ALAN","ALAN","ALAN",
"GRANT","ALAN"),
surname = c(NA,"NEDRY",
"NEDRY",NA,"NEDRY","NEDRY","SATTLER","SATTLER",
NA,"SATTLER","SATTLER","SATTLER","M",NA,
"IAN","MALCOLM","MALCOLM",NA,"ALAN","GRANT",
"GRANT","GRANT","ALAN","GRANT"),
postcode = c("HA4 0FF",
"HA4 0FF","HA4 0FF",NA,"HA4 0FF","HA4 0FF",
"L3 1DZ","L3 1DZ","L3 1DZ","L3 1DZ",NA,"L3 1DZ",
"BN14 9EP",NA,"BN14 9EP",NA,NA,NA,"CW6 9TX",
"CW6 9TX",NA,NA,NA,NA),
specimen_date = c("2024-08-14",
"2023-02-03","2023-02-07","2023-02-04",
"2023-02-09","2024-08-14","2021-03-28","2021-03-28",
"2021-03-28","2021-03-28","2021-03-28",
"2021-03-28","2024-07-06","2024-07-06","2024-07-06",
"2023-10-31","2023-10-31","2023-10-31",
"2022-01-23","2022-01-24","2022-01-25","2022-01-26",
"2022-01-27","2022-01-28")
)
# 2) Run uk_patient_id()
result_id <- uk_patient_id(
id_test,
id = list(
nhs_number = 'nhs_number',
hospital_number = 'local_patient_identifier',
date_of_birth = 'patient_birth_date',
sex_mfu = 'sex',
forename = 'forename',
surname = 'surname',
postcode = 'postcode'
),
.useStages = 1:11, # optional
.keepStages = FALSE, # optional
.keepValidNHS = FALSE # optional
)
# 3) Show a preview
print(head(result_id), row.names = FALSE)
#> id record_id nhs_number local_patient_identifier patient_birth_date sex
#> <int> <int> <char> <char> <char> <char>
#> 1 1 9435754422 <NA> 1993-07-16 M
#> 1 2 9435754422 IG12067 1993-07-16 M
#> 1 3 <NA> <NA> 1993-07-16 M
#> 1 4 9435754422 <NA> 1993-07-16 M
#> 1 5 5555555555 IG12067 1993-07-16 <NA>
#> 1 6 <NA> IG12067 <NA> M
#> forename surname postcode specimen_date
#> <char> <char> <char> <char>
#> <NA> <NA> HA40FF 2024-08-14
#> DENNIS NEDRY HA40FF 2023-02-03
#> <NA> NEDRY HA40FF 2023-02-07
#> <NA> <NA> <NA> 2023-02-04
#> DENNIS NEDRY HA40FF 2023-02-09
#> <NA> NEDRY HA40FF 2024-08-14group_time() functionA dataset should ideally contain:
You must also supply the grouping and date arguments via function parameters:
group_time() aggregates:
Intervals (start + end dates)
e.g., hospital spells (HES/SUS)
Events (single date)
e.g., microbiology specimen dates
Two episode rules are supported:
| Type | Meaning |
|---|---|
| Static window | Based on the first event only; all events within X days of the first event belong to the same episode. |
| Rolling window | The window resets with each event; a new event inside X days of the previous event extends the episode. |
# Events example (14‑day static window):
#1) Create example data
episode_test <- structure(
list(
pat_id = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L),
species = c(rep("E. coli", 7), rep("K. pneumonia", 7)),
spec_type = c(rep("Blood", 7), rep("Blood", 4), rep("Sputum", 3)),
sp_date = structure(
c(
18262,
18263,
18281,
18282,
18262,
18263,
18281,
18265,
18270,
18281,
18283,
18259,
18260,
18281
),
class = "Date"
)
),
row.names = c(NA, -14L),
class = "data.frame"
)
# 2) Run group_time() for events using a 14-day static window
ep_static <- group_time(
x = episode_test,
date_start = 'sp_date',
window = 14,
window_type = 'static',
group_vars = c('pat_id', 'species', 'spec_type'),
indx_varname = 'static_indx', # optional
min_varname = 'min_date',
# optional (defaults)
max_varname = 'max_date' # optional (defaults)
)
# 3) Show a preview
print(head(ep_static), row.names = FALSE)
#> pat_id species spec_type sp_date static_indx min_date max_date
#> <int> <char> <char> <Date> <char> <Date> <Date>
#> 1 E. coli Blood 2020-01-01 1.4.1 2020-01-01 2020-01-02
#> 1 E. coli Blood 2020-01-02 1.4.1 2020-01-01 2020-01-02
#> 1 E. coli Blood 2020-01-20 1.4.2 2020-01-20 2020-01-21
#> 1 E. coli Blood 2020-01-21 1.4.2 2020-01-20 2020-01-21
#> 1 K. pneumonia Blood 2020-01-04 2.4.1 2020-01-04 2020-01-09
#> 1 K. pneumonia Blood 2020-01-09 2.4.1 2020-01-04 2020-01-09
# Intervals example (start + end dates):1) Create example interval data (start + end dates)
#1) Create example data
spell_test <- data.frame(
id = c(rep(99, 6), rep(88, 4), rep(3, 3)),
provider = c("YXZ", rep("ZXY", 5), rep("XYZ", 4), rep("YZX", 3)),
spell_start = as.Date(
c(
"2020-03-01",
"2020-07-07",
"2020-02-08",
"2020-04-28",
"2020-03-15",
"2020-07-01",
"2020-01-01",
"2020-01-12",
"2019-12-25",
"2020-03-28",
"2020-01-01",
NA,
NA
)
),
spell_end = as.Date(
c(
"2020-03-10",
"2020-07-26",
"2020-05-22",
"2020-04-30",
"2020-05-20",
"2020-07-08",
"2020-01-23",
"2020-03-30",
"2020-01-02",
"2020-04-20",
"2020-01-01",
NA,
NA
)
)
)
# 2) Run group_time() for intervals (start + end dates)
spell_episodes <- group_time(
x = spell_test,
date_start = 'spell_start',
date_end = 'spell_end',
group_vars = c('id', 'provider'),
indx_varname = 'spell_id',
# optional
min_varname = 'spell_min_date',
# optional
max_varname = 'spell_max_date' # optional
)
# 3) Show a preview
print(head(spell_episodes), row.names = FALSE)
#> id provider spell_start spell_end spell_id spell_min_date spell_max_date
#> <num> <char> <Date> <Date> <char> <Date> <Date>
#> 88 XYZ 2019-12-25 2020-01-02 1.4.0 2019-12-25 2020-04-20
#> 88 XYZ 2020-01-01 2020-01-23 1.4.0 2019-12-25 2020-04-20
#> 3 YZX 2020-01-01 2020-01-01 2.1.0 2020-01-01 2020-01-01
#> 88 XYZ 2020-01-12 2020-03-30 1.4.0 2019-12-25 2020-04-20
#> 99 ZXY 2020-02-08 2020-05-22 3.5.0 2020-02-08 2020-05-22
#> 99 YXZ 2020-03-01 2020-03-10 4.1.0 2020-03-01 2020-03-10# Example data generation
# Helper to make plausible 10-digit NHS-like strings
mk_nhs <- function(n) {
apply(matrix(sample(0:9, n * 10, replace = TRUE), nrow = n, byrow = TRUE),
1, paste0, collapse = "")
}
# A small "people" frame to borrow shared attributes from
persons <- tibble::tibble(
id_person = 1:6,
nhsnumber = c(mk_nhs(5), NA_character_), # include one missing NHS
forename = c("John", "Jane", "Sam", "Aisha", "Maya", "John"),
surname = c("Smith", "Doe", "Patel", "Khan", "Brown", "Smyth"), # one spelling variant
date_of_birth = as.Date(c("1980-03-14","1991-11-02","1985-07-28","2002-01-09","2010-05-30","1980-03-14")),
sex = c("M","F","M","F","U","M"),
postcode = c("SW1A 1AA","E1 6AN","B1 1AA","M1 1AE","CF10 1EP","SW1A1AA") # one without space
)
# --- SGSS-like data: multiple specimens per person, some within 30 days ------
sgss_data <- persons |>
# duplicate person 1 (two specimens), and include others
slice(c(1, 1, 2, 3, 4, 5, 6)) |>
mutate(
CDR_OPIE_ID = row_number() + 1000L,
earliest_specimen_date = as.Date("2023-10-01") + c(0, 10, 5, 15, 40, 3, 2),
GROUP_A_STREP_PCR = c("Detected","Not detected","Detected","Detected","Not detected","Detected","Detected"),
third = c("emm1","emm1","emm3","emm12","",NA,"emm89")
) |>
select(
CDR_OPIE_ID, nhsnumber, forename, surname, date_of_birth, sex,
postcode, earliest_specimen_date, GROUP_A_STREP_PCR, third
)
# --- CIMS-like data: case notifications, some within 30 days of SGSS ----------
cims_data <- persons |>
# person 3 appears twice, person 1 appears once etc.
slice(c(1, 2, 3, 3, 4, 6)) %>%
mutate(
Case_identifier = row_number() + 2000L,
Date_entered = as.Date("2023-10-05") + c(0, 6, 18, 50, 35, 1),
Infection = c("iGAS","Scarlet fever","iGAS","iGAS","Scarlet fever","iGAS")
) |>
select(
Case_identifier, nhsnumber, forename, surname, date_of_birth,
sex, postcode, Date_entered, Infection
)
# Example start:
# Example of importing SGSS and CIMS data ready for linkage
lnk.data_sgss <- sgss_data
lnk.data_cims <- cims_data
# Update SGSS column classes so they match with CIMS
lnk.data_sgss <- lnk.data_sgss |>
mutate(
date_of_birth = as.Date(date_of_birth),
forename = as.character(forename),
surname = as.character(surname),
postcode = as.character(postcode),
patient_demog_sex = as.character(sex),
earliest_specimen_date = as.Date(earliest_specimen_date),
nhsnumber = as.character(nhsnumber)
)
# Update CIMS column classes so they match SGSS
lnk.data_cims <- lnk.data_cims |>
mutate(
date_of_birth = as.Date(date_of_birth),
forename = as.character(forename),
surname = as.character(surname),
postcode = as.character(postcode),
sex = as.character(sex),
Date_entered = as.Date(Date_entered))
# Convert data.frame to data.table so can be fed into function
lnk.dt_cims <- data.table::setDT(lnk.data_cims)
lnk.dt_sgss <- data.table::setDT(lnk.data_sgss)
# Stack the two data sets
lnk.dt_combined <-
bind_rows(
mutate(lnk.dt_cims, data_source = "dt_cims"),
mutate(lnk.dt_sgss, data_source = "dt_sgss")
)
# Add common date field to be used during deduplication
lnk.dt_combined <- lnk.dt_combined |>
mutate(common_date = (coalesce(earliest_specimen_date, Date_entered)))
# List of id fields to be used during normalisation of id fields
# This is a parameter that is fed into the uk_patient_id() function
id = list(
nhs_number = 'nhsnumber',
date_of_birth = 'date_of_birth',
sex_mfu = 'sex',
forename = 'forename',
surname = 'surname',
postcode = 'postcode'
)
# Feeding combined SGSS and CIMS data into uk_patient_id() function to get
# unique patient identifiers
lnk.dt_combined_norm <- epidm::uk_patient_id(
lnk.dt_combined,
id,
.useStages = c(1:6),
.sortOrder = 'common_date',
.forceCopy = TRUE,
.keepValidNHS = FALSE,
.keepStages = TRUE
)
#> Warning in .f(.x[[i]], ...): NHS number is missing or empty
#> Warning in .f(.x[[i]], ...): NHS number is missing or empty
# Group records by `id` into rolling 30-day windows based on `common_date`,
# using `dedupe_key` as the unique row index; the trailing [] forces evaluation and returns a data.table
lnk.dt_combined_grouped <- epidm::group_time(
x = lnk.dt_combined_norm,
date_start = 'common_date',
window = 30,
window_type = 'rolling',
indx_varname = 'dedup_key',
group_vars = c(
"id"
)
)[]
# Filter to pull out just CIMS data
lnk.grouped_cims <- lnk.dt_combined_grouped |>
filter(data_source == "dt_cims") |>
select (
id,
Case_identifier,
date_of_birth,
forename,
surname,
nhsnumber,
postcode,
Infection,
sex,
common_date,
data_source,
dedup_key
)
# Filter to pull out just SGSS data
lnk.grouped_sgss <- lnk.dt_combined_grouped |>
filter(data_source == "dt_sgss") |>
select (
CDR_OPIE_ID,
id,
date_of_birth,
forename,
surname,
nhsnumber,
postcode,
GROUP_A_STREP_PCR,
common_date,
data_source,
dedup_key,
third
)
# Taking one row from each split - gives one episode per dataset
lnk.grouped_cims_deduped <- lnk.grouped_cims |>
group_by(dedup_key) |>
slice(1) |>
ungroup()
# De-duplicating SGSS- prioritising earliest emm typing with a relevant result
lnk.grouped_sgss_deduped <- lnk.grouped_sgss |>
group_by(dedup_key) |>
arrange(dedup_key, common_date) |>
slice(1) |>
ungroup()
# Join splits by dedup key- common field names with .x refer to CIMS and .y for SGSS
lnk.split_join_cims_sgss <-
full_join(lnk.grouped_cims_deduped, lnk.grouped_sgss_deduped, by = "dedup_key")These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.