Simulation Study: Comparing Item Selection Criteria and Stopping Rules

The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

Overview

This vignette presents a simulation study that evaluates how different item selection criteria and stopping rules affect classification accuracy and test efficiency in CD-CAT.

We use the k3_GDINA_HDLV dataset from cdcat_sim: a 3-attribute, 70-item GDINA bank with high-discrimination / low-variability item parameters, and 160 examinees covering all possible attribute profiles (20 per profile).

Experimental factors

Factor	Levels
Item selection criterion	KL, PWKL, MPWKL, SHE
Stopping rule	Single threshold (\(\tau = 0.80\)), Dual threshold (\(\tau_1 = 0.70,\ \tau_2 = 0.10\))

All conditions share the same settings:

Estimation: MAP with uniform prior
Minimum items: 1 (no minimum constraint)
Maximum items: none (test runs until the stopping criterion triggers)
Start item: one randomly drawn item, fixed across all conditions and examinees for comparability

Evaluation metrics

PCCR (Pattern Correct Classification Rate): proportion of examinees whose entire attribute profile was correctly recovered.
ACCR\(_k\) (Attribute CCR): marginal correct classification rate for attribute \(k\).
ATL (Average Test Length): mean number of items administered before the stopping rule triggered.

Setup

library(cdCAT)

# Dataset: K = 3, GDINA model, high-discrimination / low-variability
d <- cdcat_sim[["k3_GDINA_HDLV"]]

Q         <- d$Q          # 70 x 3
alpha     <- d$alpha      # 160 x 3  (true profiles)
responses <- d$responses  # 160 x 70
K         <- ncol(Q)
N         <- nrow(alpha)
J         <- nrow(Q)

# Build the item bank
items <- cdcat_items(Q, "GDINA", gdina_params = d$parameters)
items

## cdCAT Item Bank
##   Model  : GDINA 
##   Items  : 70 
##   Attrs  : 3

# Draw one start item, fixed for all conditions and examinees
set.seed(42)
start_item <- sample(J, 1L)
cat("Fixed start item:", start_item, "\n")

## Fixed start item: 49

Simulation

conditions <- expand.grid(
  criterion  = c("KL", "PWKL", "MPWKL", "SHE"),
  stop_rule  = c("single", "dual"),
  stringsAsFactors = FALSE
)

run_condition <- function(criterion, stop_rule) {

  threshold <- if (stop_rule == "single") 0.80 else c(0.70, 0.10)

  results <- vector("list", N)

  for (i in seq_len(N)) {
    session <- CdcatSession$new(
      items,
      criterion  = criterion,
      method     = "MAP",
      prior      = NULL,        # uniform prior
      threshold  = threshold,
      min_items  = 1L,
      max_items  = J,           # effectively no maximum: bounded by bank size
      start_item = start_item
    )

    repeat {
      item <- session$next_item()
      if (item == 0L) break
      session$update(item, responses[i, item])
    }

    res <- session$result()
    results[[i]] <- list(
      alpha_hat = res$alpha_hat,
      n_items   = res$n_items
    )
  }

  results
}

set.seed(2025)
sim_results <- lapply(seq_len(nrow(conditions)), function(ci) {
  run_condition(conditions$criterion[ci], conditions$stop_rule[ci])
})
names(sim_results) <- paste(conditions$criterion, conditions$stop_rule, sep = "_")

Results

compute_metrics <- function(results, alpha) {
  N  <- length(results)
  K  <- ncol(alpha)

  alpha_hat_mat <- do.call(rbind, lapply(results, `[[`, "alpha_hat"))
  n_items_vec   <- sapply(results, `[[`, "n_items")

  # PCCR: all K attributes correct
  pccr <- mean(apply(alpha_hat_mat == alpha, 1, all))

  # ACCR per attribute
  accr <- colMeans(alpha_hat_mat == alpha)
  names(accr) <- paste0("ACCR_A", seq_len(K))

  # ATL
  atl <- mean(n_items_vec)

  c(PCCR = round(pccr, 4),
    round(accr, 4),
    ATL   = round(atl, 2))
}

metrics_list <- lapply(sim_results, compute_metrics, alpha = alpha)
metrics_df   <- as.data.frame(do.call(rbind, metrics_list))
metrics_df   <- cbind(conditions, metrics_df)
rownames(metrics_df) <- NULL

Full results table

knitr::kable(
  metrics_df,
  col.names = c("Criterion", "Stop rule", "PCCR",
                "ACCR A1", "ACCR A2", "ACCR A3", "ATL"),
  caption   = "Classification accuracy and average test length by condition.",
  digits    = 4,
  align     = "llccccc"
)

Classification accuracy and average test length by condition.
Criterion	Stop rule	PCCR	ACCR A1	ACCR A2	ACCR A3	ATL
KL	single	0.8188	0.9750	0.9312	0.8812	5.66
PWKL	single	0.7688	0.9188	0.9188	0.8812	3.67
MPWKL	single	0.7812	0.9188	0.9188	0.8938	3.67
SHE	single	0.7812	0.9188	0.9188	0.8938	3.67
KL	dual	0.9312	1.0000	0.9500	0.9812	7.68
PWKL	dual	0.8438	0.9250	0.9188	0.9625	4.31
MPWKL	dual	0.8438	0.9250	0.9188	0.9625	4.28
SHE	dual	0.8375	0.9312	0.9188	0.9562	4.24

PCCR and ATL by criterion

agg_crit <- aggregate(
  cbind(PCCR, ATL) ~ criterion,
  data = metrics_df,
  FUN  = mean
)
knitr::kable(
  agg_crit,
  col.names = c("Criterion", "Mean PCCR", "Mean ATL"),
  caption   = "Average PCCR and ATL across stopping rules, by criterion.",
  digits    = 4
)

Average PCCR and ATL across stopping rules, by criterion.
Criterion	Mean PCCR	Mean ATL
KL	0.8750	6.670
MPWKL	0.8125	3.975
PWKL	0.8063	3.990
SHE	0.8094	3.955

PCCR and ATL by stopping rule

agg_stop <- aggregate(
  cbind(PCCR, ATL) ~ stop_rule,
  data = metrics_df,
  FUN  = mean
)
knitr::kable(
  agg_stop,
  col.names = c("Stop rule", "Mean PCCR", "Mean ATL"),
  caption   = "Average PCCR and ATL across criteria, by stopping rule.",
  digits    = 4
)

Average PCCR and ATL across criteria, by stopping rule.
Stop rule	Mean PCCR	Mean ATL
dual	0.8641	5.1275
single	0.7875	4.1675

PCCR heatmap

pccr_mat <- matrix(
  metrics_df$PCCR,
  nrow     = 4,
  dimnames = list(
    Criterion = c("KL", "PWKL", "MPWKL", "SHE"),
    Stop      = c("single", "dual")
  )
)

image(
  t(pccr_mat),
  col    = hcl.colors(20, "Blues", rev = TRUE),
  xaxt   = "n", yaxt = "n",
  zlim   = c(min(pccr_mat) - 0.02, 1),
  main   = "PCCR",
  xlab   = "Stopping rule",
  ylab   = "Criterion"
)
axis(1, at = c(0, 1), labels = c("single", "dual"))
axis(2, at = seq(0, 1, length.out = 4),
     labels = c("KL", "PWKL", "MPWKL", "SHE"), las = 2)
for (i in 1:2) for (j in 1:4) {
  text((i - 1), (j - 1) / 3,
       labels = sprintf("%.3f", pccr_mat[j, i]),
       cex = 0.85)
}

PCCR by criterion and stopping rule.

Summary

The table below recaps the best-performing condition per metric.

best_pccr  <- metrics_df[which.max(metrics_df$PCCR),  c("criterion","stop_rule","PCCR","ATL")]
best_atl   <- metrics_df[which.min(metrics_df$ATL),   c("criterion","stop_rule","PCCR","ATL")]

knitr::kable(
  rbind(
    cbind(Optimises = "PCCR (max)", best_pccr),
    cbind(Optimises = "ATL (min)",  best_atl)
  ),
  row.names = FALSE,
  caption   = "Best condition per optimisation target."
)

Best condition per optimisation target.
Optimises	criterion	stop_rule	PCCR	ATL
PCCR (max)	KL	dual	0.9312	7.68
ATL (min)	PWKL	single	0.7688	3.67

Note. All results are based on a single replication. For publication-quality conclusions, increase N or run multiple replications with different seeds.

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.