The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

Example 1: Simulated Data

library(PheCAP)

set.seed(123)

Generate simulated data.

latent <- rgamma(8000, 0.3)
latent2 <- rgamma(8000, 0.3)
ehr_data <- data.frame(
  patient_id = 1:8000,
  ICD1 = rpois(8000, 7 * (rgamma(8000, 0.2) + latent) / 0.5),
  ICD2 = rpois(8000, 6 * (rgamma(8000, 0.8) + latent) / 1.1),
  ICD3 = rpois(8000, 1 * rgamma(8000, 0.5 + latent2) / 0.5),
  ICD4 = rpois(8000, 2 * rgamma(8000, 0.5) / 0.5),
  NLP1 = rpois(8000, 8 * (rgamma(8000, 0.2) + latent) / 0.6),
  NLP2 = rpois(8000, 2 * (rgamma(8000, 1.1) + latent) / 1.5),
  NLP3 = rpois(8000, 5 * (rgamma(8000, 0.1) + latent) / 0.5),
  NLP4 = rpois(8000, 11 * rgamma(8000, 1.9 + latent) / 1.9),
  NLP5 = rpois(8000, 3 * rgamma(8000, 0.5 + latent2) / 0.5),
  NLP6 = rpois(8000, 2 * rgamma(8000, 0.5) / 0.5),
  NLP7 = rpois(8000, 1 * rgamma(8000, 0.5) / 0.5),
  HU = rpois(8000, 30 * rgamma(8000, 0.1) / 0.1),
  label = NA)
ii <- sample.int(8000, 400)
ehr_data[ii, "label"] <- with(
  ehr_data[ii, ], rbinom(400, 1, plogis(
    -5 + 1.5 * log1p(ICD1) + log1p(NLP1) +
      0.8 * log1p(NLP3) - 0.5 * log1p(HU))))
head(ehr_data)

##   patient_id ICD1 ICD2 ICD3 ICD4 NLP1 NLP2 NLP3 NLP4 NLP5 NLP6 NLP7  HU label
## 1          1    2   11    3    0    0    2    2    2    1    7    0   4    NA
## 2          2    5   11    3    1    5    0    1   11    5    0    0   0    NA
## 3          3   17   10    0    2   20    2   16   10    0    0    2   0    NA
## 4          4    0    5    0    0    0    0    0   15    3    3    7   0    NA
## 5          5    0    5    1    0    0    3    7    6   31    0    3 146    NA
## 6          6    4   11    4    0   13    2    3    4    2    3    0   0    NA

tail(ehr_data)

##      patient_id ICD1 ICD2 ICD3 ICD4 NLP1 NLP2 NLP3 NLP4 NLP5 NLP6 NLP7 HU label
## 7995       7995    9   23    0    7   11    4    1   17    4    0    0  1    NA
## 7996       7996   21    7    9    0    6    1    1    6    4    1    2 11    NA
## 7997       7997   26    0    0    1    0    1    0   10    3    0    3  0    NA
## 7998       7998    3    0    5    1   34    1    5   21    4    4    1 46     0
## 7999       7999    0    0    0    0    4    0    0    5    0    0    5  0    NA
## 8000       8000    0    4    0    0    0   14    1    3    0    4    2  9    NA

Define features and labels used for phenotyping.

data <- PhecapData(ehr_data, "HU", "label", 0.4, patient_id = "patient_id")
data

## PheCAP Data
## Feature: 8000 observations of 12 variables
## Label: 132 yes, 268 no, 7600 missing
## Size of training samples: 240
## Size of validation samples: 160

Specify the surrogate used for surrogate-assisted feature extraction (SAFE). The typical way is to specify a main ICD code, a main NLP CUI, as well as their combination. The default lower_cutoff is 1, and the default upper_cutoff is 10. In some cases one may want to define surrogate through lab test. Feel free to change the cutoffs based on domain knowledge.

surrogates <- list(
  PhecapSurrogate(
    variable_names = "ICD1",
    lower_cutoff = 1, upper_cutoff = 10),
  PhecapSurrogate(
    variable_names = "NLP1",
    lower_cutoff = 1, upper_cutoff = 10))

Run surrogate-assisted feature extraction (SAFE) and show result.

system.time(feature_selected <- phecap_run_feature_extraction(data, surrogates))

##    user  system elapsed 
##   9.255   0.000   9.275

feature_selected

## Feature(s) selected by surrogate-assisted feature extraction (SAFE)
## [1] "ICD1" "ICD2" "NLP1" "NLP2" "NLP3"

Train phenotyping model and show the fitted model, with the AUC on the training set as well as random splits.

suppressWarnings(model <- phecap_train_phenotyping_model(data, surrogates, feature_selected))
model

## Phenotyping model:
## $lasso_bic
## (Intercept)        ICD1        NLP1          HU        ICD2        NLP2 
##  -5.3901962   1.9297295   1.2402141  -0.4471979   0.0000000   0.0000000 
##        NLP3 
##   0.9516273 
## 
## AUC on training data: 0.95 
## Average AUC on random splits: 0.938

Validate phenotyping model using validation label, and show the AUC and ROC.

validation <- phecap_validate_phenotyping_model(data, model)

## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values

## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values

validation

## AUC on validation data: 0.921 
## AUC on training data: 0.95 
## Average AUC on random splits: 0.938

round(validation$valid_roc[validation$valid_roc[, "FPR"] <= 0.2, ], 3)

##       cutoff pos.rate   FPR   TPR   PPV   NPV    F1
##  [1,]  0.999    0.003 0.000 0.061 1.000 0.701 0.115
##  [2,]  0.969    0.100 0.009 0.308 0.939 0.759 0.464
##  [3,]  0.854    0.175 0.018 0.490 0.925 0.809 0.641
##  [4,]  0.680    0.194 0.027 0.584 0.907 0.837 0.710
##  [5,]  0.632    0.225 0.036 0.636 0.888 0.853 0.741
##  [6,]  0.596    0.253 0.050 0.700 0.864 0.874 0.773
##  [7,]  0.503    0.262 0.064 0.700 0.833 0.873 0.761
##  [8,]  0.461    0.269 0.073 0.700 0.814 0.872 0.753
##  [9,]  0.430    0.275 0.082 0.700 0.795 0.871 0.745
## [10,]  0.408    0.281 0.091 0.700 0.778 0.870 0.737
## [11,]  0.382    0.291 0.100 0.710 0.763 0.872 0.736
## [12,]  0.341    0.300 0.109 0.720 0.750 0.875 0.735
## [13,]  0.320    0.312 0.118 0.734 0.738 0.879 0.736
## [14,]  0.297    0.331 0.127 0.772 0.734 0.894 0.752
## [15,]  0.291    0.344 0.136 0.806 0.729 0.907 0.765
## [16,]  0.290    0.359 0.150 0.820 0.713 0.912 0.763
## [17,]  0.285    0.369 0.164 0.820 0.695 0.911 0.752
## [18,]  0.255    0.375 0.173 0.824 0.684 0.912 0.748
## [19,]  0.234    0.388 0.182 0.846 0.679 0.921 0.753
## [20,]  0.215    0.400 0.191 0.868 0.674 0.931 0.759
## [21,]  0.201    0.412 0.200 0.880 0.667 0.936 0.759

phecap_plot_roc_curves(validation)

## Warning: replacing previous import 'vctrs::data_frame' by 'tibble::data_frame'
## when loading 'dplyr'

## Warning: Use of `df$value_x` is discouraged. Use `value_x` instead.

## Warning: Use of `df$value_y` is discouraged. Use `value_y` instead.

Apply the model to all the patients to obtain predicted phenotype.

phenotype <- phecap_predict_phenotype(data, model)
idx <- which.min(abs(validation$valid_roc[, "FPR"] - 0.05))
cut.fpr95 <- validation$valid_roc[idx, "cutoff"]
case_status <- ifelse(phenotype$prediction >= cut.fpr95, 1, 0)
predict.table <- cbind(phenotype, case_status)
predict.table[1:10, ]

##    patient_id  prediction case_status
## 1           1 0.034377662           0
## 2           2 0.433139377           0
## 3           3 0.990857331           1
## 4           4 0.004233424           0
## 5           5 0.003442046           0
## 6           6 0.702835815           1
## 7           7 0.843285354           1
## 8           8 0.001620771           0
## 9           9 0.913644435           1
## 10         10 0.044409875           0

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.