The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.
The goal of is this vignette is to illustrate the cases functionality by means of a real-world example focused on biomarker assessment and prediction model evaluation.
Load the package:
## load packages:
library(readr)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(splitstackshape)
library(glmnet)
#> Loading required package: Matrix
#> Loaded glmnet 4.1-4
library(cases)
We utilize the breast cancer wisconsin (diagnostic) data set.
?data_wdbc<- data_wdbc data
dim(data)
#> [1] 569 31
table(data$diagnosis)
#>
#> 0 1
#> 357 212
## Missing values?
colSums(is.na(data)) # -> no missing values
#> diagnosis radius_mean texture_mean
#> 0 0 0
#> perimeter_mean area_mean smoothness_mean
#> 0 0 0
#> compactness_mean concavity_mean concave_points_mean
#> 0 0 0
#> symmetry_mean fractal_dimension_mean radius_sd
#> 0 0 0
#> texture_sd perimeter_sd area_sd
#> 0 0 0
#> smoothness_sd compactness_sd concavity_sd
#> 0 0 0
#> concave_points_sd symmetry_sd fractal_dimension_sd
#> 0 0 0
#> radius_peak texture_peak perimeter_peak
#> 0 0 0
#> area_peak smoothness_peak compactness_peak
#> 0 0 0
#> concavity_peak concave_points_peak symmetry_peak
#> 0 0 0
#> fractal_dimension_peak
#> 0
summary(data)
#> diagnosis radius_mean texture_mean perimeter_mean area_mean
#> 0:357 Min. : 6.981 Min. : 9.71 Min. : 43.79 Min. : 143.5
#> 1:212 1st Qu.:11.700 1st Qu.:16.17 1st Qu.: 75.17 1st Qu.: 420.3
#> Median :13.370 Median :18.84 Median : 86.24 Median : 551.1
#> Mean :14.127 Mean :19.29 Mean : 91.97 Mean : 654.9
#> 3rd Qu.:15.780 3rd Qu.:21.80 3rd Qu.:104.10 3rd Qu.: 782.7
#> Max. :28.110 Max. :39.28 Max. :188.50 Max. :2501.0
#> smoothness_mean compactness_mean concavity_mean concave_points_mean
#> Min. :0.05263 Min. :0.01938 Min. :0.00000 Min. :0.00000
#> 1st Qu.:0.08637 1st Qu.:0.06492 1st Qu.:0.02956 1st Qu.:0.02031
#> Median :0.09587 Median :0.09263 Median :0.06154 Median :0.03350
#> Mean :0.09636 Mean :0.10434 Mean :0.08880 Mean :0.04892
#> 3rd Qu.:0.10530 3rd Qu.:0.13040 3rd Qu.:0.13070 3rd Qu.:0.07400
#> Max. :0.16340 Max. :0.34540 Max. :0.42680 Max. :0.20120
#> symmetry_mean fractal_dimension_mean radius_sd texture_sd
#> Min. :0.1060 Min. :0.04996 Min. :0.1115 Min. :0.3602
#> 1st Qu.:0.1619 1st Qu.:0.05770 1st Qu.:0.2324 1st Qu.:0.8339
#> Median :0.1792 Median :0.06154 Median :0.3242 Median :1.1080
#> Mean :0.1812 Mean :0.06280 Mean :0.4052 Mean :1.2169
#> 3rd Qu.:0.1957 3rd Qu.:0.06612 3rd Qu.:0.4789 3rd Qu.:1.4740
#> Max. :0.3040 Max. :0.09744 Max. :2.8730 Max. :4.8850
#> perimeter_sd area_sd smoothness_sd compactness_sd
#> Min. : 0.757 Min. : 6.802 Min. :0.001713 Min. :0.002252
#> 1st Qu.: 1.606 1st Qu.: 17.850 1st Qu.:0.005169 1st Qu.:0.013080
#> Median : 2.287 Median : 24.530 Median :0.006380 Median :0.020450
#> Mean : 2.866 Mean : 40.337 Mean :0.007041 Mean :0.025478
#> 3rd Qu.: 3.357 3rd Qu.: 45.190 3rd Qu.:0.008146 3rd Qu.:0.032450
#> Max. :21.980 Max. :542.200 Max. :0.031130 Max. :0.135400
#> concavity_sd concave_points_sd symmetry_sd fractal_dimension_sd
#> Min. :0.00000 Min. :0.000000 Min. :0.007882 Min. :0.0008948
#> 1st Qu.:0.01509 1st Qu.:0.007638 1st Qu.:0.015160 1st Qu.:0.0022480
#> Median :0.02589 Median :0.010930 Median :0.018730 Median :0.0031870
#> Mean :0.03189 Mean :0.011796 Mean :0.020542 Mean :0.0037949
#> 3rd Qu.:0.04205 3rd Qu.:0.014710 3rd Qu.:0.023480 3rd Qu.:0.0045580
#> Max. :0.39600 Max. :0.052790 Max. :0.078950 Max. :0.0298400
#> radius_peak texture_peak perimeter_peak area_peak
#> Min. : 7.93 Min. :12.02 Min. : 50.41 Min. : 185.2
#> 1st Qu.:13.01 1st Qu.:21.08 1st Qu.: 84.11 1st Qu.: 515.3
#> Median :14.97 Median :25.41 Median : 97.66 Median : 686.5
#> Mean :16.27 Mean :25.68 Mean :107.26 Mean : 880.6
#> 3rd Qu.:18.79 3rd Qu.:29.72 3rd Qu.:125.40 3rd Qu.:1084.0
#> Max. :36.04 Max. :49.54 Max. :251.20 Max. :4254.0
#> smoothness_peak compactness_peak concavity_peak concave_points_peak
#> Min. :0.07117 Min. :0.02729 Min. :0.0000 Min. :0.00000
#> 1st Qu.:0.11660 1st Qu.:0.14720 1st Qu.:0.1145 1st Qu.:0.06493
#> Median :0.13130 Median :0.21190 Median :0.2267 Median :0.09993
#> Mean :0.13237 Mean :0.25427 Mean :0.2722 Mean :0.11461
#> 3rd Qu.:0.14600 3rd Qu.:0.33910 3rd Qu.:0.3829 3rd Qu.:0.16140
#> Max. :0.22260 Max. :1.05800 Max. :1.2520 Max. :0.29100
#> symmetry_peak fractal_dimension_peak
#> Min. :0.1565 Min. :0.05504
#> 1st Qu.:0.2504 1st Qu.:0.07146
#> Median :0.2822 Median :0.08004
#> Mean :0.2901 Mean :0.08395
#> 3rd Qu.:0.3179 3rd Qu.:0.09208
#> Max. :0.6638 Max. :0.20750
## define minimal acceptable criteria for specificity, sensitivity:
<- 0.7
sp0 <- 0.9
se0 <- c(sp0, se0) benchmark
<- seq(0,1, 0.1)
pr
quantile(data$area_peak, pr) # 500, 600, 700, 800, 900 ---> area
#> 0% 10% 20% 30% 40% 50% 60% 70% 80% 90%
#> 185.20 384.72 475.98 544.14 599.70 686.50 781.18 926.96 1269.00 1673.00
#> 100%
#> 4254.00
quantile(data$compactness_peak, pr) # 0.10, 0.15, 0.20, 0.25, 0.30 ---> compactness (perimeter^2 / area - 1.0)
#> 0% 10% 20% 30% 40% 50% 60% 70%
#> 0.027290 0.093676 0.125660 0.161400 0.184620 0.211900 0.251400 0.303960
#> 80% 90% 100%
#> 0.367060 0.447840 1.058000
quantile(data$concavity_peak, pr) # 0.10, 0.15, 0.20, 0.25, 0.30 ---> concavity (severity of concave portions of the contour)
#> 0% 10% 20% 30% 40% 50% 60% 70%
#> 0.000000 0.045652 0.091974 0.136880 0.177180 0.226700 0.286600 0.349920
#> 80% 90% 100%
#> 0.419540 0.571320 1.252000
<- c(500, 600, 700, 800, 900,
cc 0.10, 0.15, 0.20, 0.25, 0.30,
0.10, 0.15, 0.20, 0.25, 0.30)
<- data %>%
comp_bm ::select(area_peak, compactness_peak, concavity_peak) %>%
dplyr::categorize(cc, rep(1:3, each=5)) %>%
cases::compare(labels = as.numeric(as.character(data$diagnosis)))
cases
<- cases::evaluate(comp_bm, benchmark = benchmark,
results_bm alternative = "greater", alpha = 0.025,
adj="boot", regu=1)
#> Drawing 2000 'pairs' bootstrap samples...
results_bm#> [cases] evaluation results:
#> $specificity
#> parameter hypothesis estimate lower upper pval pval_all reject
#> 1 area_peak_500 <= 0.7 0.3645 0.2911 Inf 1.000 1.0000 FALSE
#> 2 area_peak_600 <= 0.7 0.6243 0.5505 Inf 1.000 1.0000 FALSE
#> 3 area_peak_700 <= 0.7 0.8059 0.7456 Inf 0.000 0.0010 TRUE
#> 4 area_peak_800 <= 0.7 0.9064 0.8620 Inf 0.000 1.0000 TRUE
#> 5 area_peak_900 <= 0.7 0.9763 0.9530 Inf 0.000 1.0000 TRUE
#> 6 compactness_peak_0.1 <= 0.7 0.1913 0.1314 Inf 1.000 1.0000 FALSE
#> 7 compactness_peak_0.15 <= 0.7 0.3980 0.3234 Inf 1.000 1.0000 FALSE
#> 8 compactness_peak_0.2 <= 0.7 0.6466 0.5738 Inf 1.000 1.0000 FALSE
#> 9 compactness_peak_0.25 <= 0.7 0.7975 0.7362 Inf 0.001 1.0000 TRUE
#> 10 compactness_peak_0.3 <= 0.7 0.8925 0.8452 Inf 0.000 1.0000 TRUE
#> 11 concavity_peak_0.1 <= 0.7 0.3422 0.2698 Inf 1.000 1.0000 FALSE
#> 12 concavity_peak_0.15 <= 0.7 0.5321 0.4560 Inf 1.000 1.0000 FALSE
#> 13 concavity_peak_0.2 <= 0.7 0.7221 0.6538 Inf 0.798 0.7980 FALSE
#> 14 concavity_peak_0.25 <= 0.7 0.8059 0.7456 Inf 0.000 0.9855 TRUE
#> 15 concavity_peak_0.3 <= 0.7 0.8673 0.8156 Inf 0.000 1.0000 TRUE
#> reject_all
#> 1 FALSE
#> 2 FALSE
#> 3 TRUE
#> 4 FALSE
#> 5 FALSE
#> 6 FALSE
#> 7 FALSE
#> 8 FALSE
#> 9 FALSE
#> 10 FALSE
#> 11 FALSE
#> 12 FALSE
#> 13 FALSE
#> 14 FALSE
#> 15 FALSE
#>
#> $sensitivity
#> parameter hypothesis estimate lower upper pval pval_all
#> 1 area_peak_500 <= 0.9 0.9977 0.9881 Inf 0.0000 1.0000
#> 2 area_peak_600 <= 0.9 0.9742 0.9429 Inf 0.0000 1.0000
#> 3 area_peak_700 <= 0.9 0.9601 0.9214 Inf 0.0010 0.0010
#> 4 area_peak_800 <= 0.9 0.8850 0.8220 Inf 1.0000 1.0000
#> 5 area_peak_900 <= 0.9 0.8052 0.7269 Inf 1.0000 1.0000
#> 6 compactness_peak_0.1 <= 0.9 0.9836 0.9585 Inf 0.0000 1.0000
#> 7 compactness_peak_0.15 <= 0.9 0.9648 0.9284 Inf 0.0000 1.0000
#> 8 compactness_peak_0.2 <= 0.9 0.8756 0.8104 Inf 1.0000 1.0000
#> 9 compactness_peak_0.25 <= 0.9 0.7441 0.6580 Inf 1.0000 1.0000
#> 10 compactness_peak_0.3 <= 0.9 0.6362 0.5411 Inf 1.0000 1.0000
#> 11 concavity_peak_0.1 <= 0.9 0.9883 0.9670 Inf 0.0000 1.0000
#> 12 concavity_peak_0.15 <= 0.9 0.9789 0.9505 Inf 0.0000 1.0000
#> 13 concavity_peak_0.2 <= 0.9 0.9695 0.9355 Inf 0.0000 0.7980
#> 14 concavity_peak_0.25 <= 0.9 0.9038 0.8455 Inf 0.9855 0.9855
#> 15 concavity_peak_0.3 <= 0.9 0.8052 0.7269 Inf 1.0000 1.0000
#> reject reject_all
#> 1 TRUE FALSE
#> 2 TRUE FALSE
#> 3 TRUE TRUE
#> 4 FALSE FALSE
#> 5 FALSE FALSE
#> 6 TRUE FALSE
#> 7 TRUE FALSE
#> 8 FALSE FALSE
#> 9 FALSE FALSE
#> 10 FALSE FALSE
#> 11 TRUE FALSE
#> 12 TRUE FALSE
#> 13 TRUE FALSE
#> 14 FALSE FALSE
#> 15 FALSE FALSE
visualize(results_bm)
## data splitting:
set.seed(1337)
<- stratified(data, c('diagnosis'), 1/3, bothSets = TRUE)
split <- split[[1]] %>% as.data.frame()
val <- split[[2]] %>% as.data.frame()
trn dim(trn)
#> [1] 379 31
dim(val)
#> [1] 190 31
table(val$diagnosis)
#>
#> 0 1
#> 119 71
## train example model
<- glmnet(x=trn[,-1], y=trn[,1], family="binomial", alpha=0.25)
mod str(mod, 0)
#> List of 13
#> - attr(*, "class")= chr [1:2] "lognet" "glmnet"
set.seed(1337)
## define hyperparameter values for L1/L2 penalty mixing parameter (alpha):
<- c(0, 0.25, 0.5, 0.75, 1)
aa
## train models and create predictions:
<- sapply(aa, function(alpha){
pred_pm <- cv.glmnet(x = as.matrix(trn[,-1]), y=trn[,1],
mod_pm family = "binomial",
type.measure = "class",
alpha = alpha)
message(paste0("cv.glmnet (alpha = ", alpha, "):"))
print(mod_pm)
message("+++++")
predict(mod_pm, as.matrix(val[,-1]), type="response")
})#> cv.glmnet (alpha = 0):
#>
#> Call: cv.glmnet(x = as.matrix(trn[, -1]), y = trn[, 1], type.measure = "class", family = "binomial", alpha = alpha)
#>
#> Measure: Misclassification Error
#>
#> Lambda Index Measure SE Nonzero
#> min 0.03847 100 0.01847 0.006853 30
#> 1se 0.04634 98 0.02375 0.008289 30
#> +++++
#> cv.glmnet (alpha = 0.25):
#>
#> Call: cv.glmnet(x = as.matrix(trn[, -1]), y = trn[, 1], type.measure = "class", family = "binomial", alpha = alpha)
#>
#> Measure: Misclassification Error
#>
#> Lambda Index Measure SE Nonzero
#> min 0.004383 64 0.01055 0.004299 27
#> 1se 0.012195 53 0.01319 0.005888 23
#> +++++
#> cv.glmnet (alpha = 0.5):
#>
#> Call: cv.glmnet(x = as.matrix(trn[, -1]), y = trn[, 1], type.measure = "class", family = "binomial", alpha = alpha)
#>
#> Measure: Misclassification Error
#>
#> Lambda Index Measure SE Nonzero
#> min 0.004612 56 0.01055 0.008061 22
#> 1se 0.012834 45 0.01847 0.008869 19
#> +++++
#> cv.glmnet (alpha = 0.75):
#>
#> Call: cv.glmnet(x = as.matrix(trn[, -1]), y = trn[, 1], type.measure = "class", family = "binomial", alpha = alpha)
#>
#> Measure: Misclassification Error
#>
#> Lambda Index Measure SE Nonzero
#> min 0.002802 57 0.007916 0.004023 19
#> 1se 0.009391 44 0.010554 0.005824 18
#> +++++
#> cv.glmnet (alpha = 1):
#>
#> Call: cv.glmnet(x = as.matrix(trn[, -1]), y = trn[, 1], type.measure = "class", family = "binomial", alpha = alpha)
#>
#> Measure: Misclassification Error
#>
#> Lambda Index Measure SE Nonzero
#> min 0.003672 51 0.01319 0.005888 13
#> 1se 0.005328 47 0.01847 0.008822 11
#> +++++
colnames(pred_pm) <- paste0("en", aa*100)
head(pred_pm)
#> en0 en25 en50 en75 en100
#> [1,] 0.9747700 0.9961507 0.9938921 0.9964515 0.9992168
#> [2,] 0.9996722 0.9999897 0.9999674 0.9999877 0.9999995
#> [3,] 0.9142164 0.9763833 0.9698399 0.9816259 0.9949924
#> [4,] 0.9955105 0.9995248 0.9991571 0.9995259 0.9999234
#> [5,] 0.9559590 0.9885991 0.9807955 0.9856935 0.9949629
#> [6,] 0.7573470 0.7981572 0.7785671 0.7485468 0.6770751
## define cutpoints (probability scale):
<- rep(seq(0.1, 0.5, 0.1), 5)
cc <- rep(1:5, each=5)
mm
## create predictions:
<- pred_pm %>%
comp_pm ::categorize(cc, mm) %>%
cases::compare(labels = as.numeric(as.character(val$diagnosis)))
casesstr(comp_pm, 1)
#> List of 2
#> $ specificity:'data.frame': 119 obs. of 25 variables:
#> $ sensitivity:'data.frame': 71 obs. of 25 variables:
## conduct statistical analysis:
set.seed(1337)
<- cases::evaluate(comp_pm, benchmark = benchmark,
results_pm alternative = "greater", alpha = 0.025,
adj="boot", regu=1)
#> Drawing 2000 'pairs' bootstrap samples...
str(results_pm, 1)
#> List of 2
#> $ specificity:'data.frame': 25 obs. of 9 variables:
#> $ sensitivity:'data.frame': 25 obs. of 9 variables:
#> - attr(*, "n")= Named int [1:2] 119 71
#> ..- attr(*, "names")= chr [1:2] "specificity" "sensitivity"
#> - attr(*, "m")= int 25
#> - attr(*, "alpha")= num 0.025
#> - attr(*, "alpha_adj")= logi NA
#> - attr(*, "cv")= num [1:2] -3.65 Inf
#> - attr(*, "class")= chr [1:2] "list" "cases_results"
#> - attr(*, "contrast")= chr [1:2] "raw" NA
#> - attr(*, "benchmark")= num [1:2] 0.7 0.9
#> - attr(*, "alternative")= chr "greater"
#> - attr(*, "analysis")= chr "co-primary"
#> - attr(*, "transformation")= chr "none"
#> - attr(*, "regu")= num [1:3] 1 0.5 0.25
#> - attr(*, "pars")= list()
%>% lapply(filter, reject_all)
results_pm #> $specificity
#> parameter hypothesis estimate lower upper pval pval_all reject
#> 1 en25_0.1 <= 0.7 0.8541667 0.7371061 Inf 0.0095 0.0095 TRUE
#> 2 en50_0.1 <= 0.7 0.8541667 0.7371061 Inf 0.0095 0.0095 TRUE
#> 3 en75_0.1 <= 0.7 0.8708333 0.7595952 Inf 0.0045 0.0045 TRUE
#> 4 en100_0.1 <= 0.7 0.8875000 0.7826976 Inf 0.0040 0.0095 TRUE
#> reject_all
#> 1 TRUE
#> 2 TRUE
#> 3 TRUE
#> 4 TRUE
#>
#> $sensitivity
#> parameter hypothesis estimate lower upper pval pval_all reject
#> 1 en25_0.1 <= 0.9 0.9791667 0.9181779 Inf 0.0095 0.0095 TRUE
#> 2 en50_0.1 <= 0.9 0.9930556 0.9575948 Inf 0.0000 0.0095 TRUE
#> 3 en75_0.1 <= 0.9 0.9930556 0.9575948 Inf 0.0000 0.0045 TRUE
#> 4 en100_0.1 <= 0.9 0.9791667 0.9181779 Inf 0.0095 0.0095 TRUE
#> reject_all
#> 1 TRUE
#> 2 TRUE
#> 3 TRUE
#> 4 TRUE
visualize(results_pm)
These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.