
Cross-validation

Comparing cross-validation methods

library(data.table)
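## Simulate a piecewise constant signal: 10 true segments with 10 data
## points each; every split method gets 100 validation sets in total.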
n.segments <- 10
seg.mean.vec <- 1:n.segments
data.per.segment <- 10
data.mean.vec <- rep(seg.mean.vec, each=data.per.segment)
n.data <- length(data.mean.vec)
n.validation.sets <- 100
n.folds.vec <- c(10, 2)
prop.valid.vec <- 1/n.folds.vec
sim.result.list <- list()
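## Only run the simulation in an interactive session.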
if(interactive()){
  for(data.seed in 1:100){
    set.seed(data.seed)
    data.vec <- rnorm(n.data, data.mean.vec, 0.1)
    is.valid.vec.list <- list()
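    ## K-fold splits: for each seed, every fold serves once as the
    ## validation set (10-fold 10 times, 2-fold 50 times).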
    for(n.folds in n.folds.vec){
      uniq.folds <- 1:n.folds
      n.seeds <- n.validation.sets/n.folds
      split.type <- sprintf("%d-fold %d times", n.folds, n.seeds)
      for(seed in 1:n.seeds){
        set.seed(seed)
        fold.vec <- sample(rep(uniq.folds, l=n.data))
        for(valid.fold in uniq.folds){
          is.valid.vec.list[[split.type]][[paste(seed, valid.fold)]] <-
            fold.vec==valid.fold
        }
      }
    }
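    ## Random splits: each validation set is drawn independently, with
    ## the same proportions of validation data as the K-fold methods.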
    for(prop.valid in prop.valid.vec){
      split.type <- sprintf("%d%% %d times", 100*prop.valid, n.validation.sets)
      prop.vec <- c(subtrain=1-prop.valid, validation=prop.valid)
      for(split.i in 1:n.validation.sets){
        set.seed(split.i)
        is.valid.vec.list[[split.type]][[split.i]] <- binsegRcpp::random_set_vec(
          n.data, prop.vec) == "validation"
      }
    }
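    ## For every validation set, fit binary segmentation to the data
    ## and record the validation loss of each model size.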
    loss.dt <- CJ(split.i=1:n.validation.sets, type=names(is.valid.vec.list))[, {
      is.valid <- is.valid.vec.list[[type]][[split.i]]
      bs.model <- binsegRcpp::binseg_normal(data.vec, is.validation.vec=is.valid)
      bs.model$splits[, data.table(
        segments,
        validation.loss)]
    }, by=.(split.i, type)]
    loss.stats <- loss.dt[, .(
      mean.valid.loss=mean(validation.loss)
    ), by=.(type, segments)]
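    ## Three selection rules: per-split min loss then smallest size,
    ## per-split min loss then most frequent size, and min mean loss.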
    select.each.split <- loss.dt[
    , .SD[which.min(validation.loss)],
      by=.(type, split.i)]
    selected.times <- select.each.split[, .(
      times=.N
    ), by=.(type, segments)]
    selected.segments <- rbind(
      select.each.split[, .(
        selected=min(segments)
      ), by=.(method=paste(type, "min err, min segs"))],
      selected.times[, .(
        selected=segments[which.max(times)]
      ), by=.(method=paste(type, "min err, max times"))],
      loss.stats[, .(
        selected=segments[which.min(mean.valid.loss)]
      ), by=.(method=paste(type, "mean err, min err"))]
    )
    sim.result.list[[data.seed]] <- data.table(
      data.seed, selected.segments, n.segments)
  }
  sim.result <- do.call(rbind, sim.result.list)
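  ## Error rates comparing the selected size to the true number of segments.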
  (sim.err <- sim.result[, .(
    zero.one.loss=sum(selected != n.segments),
    L1.loss=sum(abs(selected-n.segments)),
    L2.loss=sum((selected-n.segments)^2)
  ), by=method][order(zero.one.loss)])
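  ## Plot the last simulated data sequence for reference.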
  plot(data.vec)
}

The code above compares several types of cross-validation for selecting the number of segments in simulated random normal data. The table above shows error rates which compare the selected number of segments to the true number of segments in the simulation. The best methods appear to be the ones which use min err, max times: for each split, select the number of segments with minimum validation loss, then over all splits, select the number of segments chosen most often.
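
To make the three selection rules concrete, here is a minimal sketch which applies each rule by hand to a small hypothetical validation loss table (toy.loss is made up for illustration, not output from the simulation above).

library(data.table)
## Hypothetical validation loss for 3 splits and model sizes 1 to 3.
toy.loss <- data.table(
  split.i=rep(1:3, each=3),
  segments=rep(1:3, times=3),
  validation.loss=c(5,1,2, 6,2,1, 7,1,3))
## Per-split minimum of validation loss.
sel <- toy.loss[, .SD[which.min(validation.loss)], by=split.i]
## min err, min segs: smallest size among per-split winners.
sel[, min(segments)]
## min err, max times: size selected most often over splits.
sel[, .N, by=segments][which.max(N), segments]
## mean err, min err: size with smallest mean validation loss.
toy.loss[, .(mean.loss=mean(validation.loss)), by=segments][
  which.min(mean.loss), segments]

All three rules agree on this small table; the simulation above measures how often each rule recovers the true number of segments when the splits are noisy.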

How many times is sufficient?

## A harder simulation: 20 true segments, only 5 data points each.
n.segments <- 20
seg.mean.vec <- 1:n.segments
data.per.segment <- 5
data.mean.vec <- rep(seg.mean.vec, each=data.per.segment)
n.data <- length(data.mean.vec)
n.validation.sets <- 200
## Validation set proportions to compare, over 1 to 200 random splits.
prop.valid <- c(0.01, 0.05, 0.1, 0.25, 0.5)
if(interactive()){
  sim.result <- data.table(data.seed=1:100)[, {
    set.seed(data.seed)
    data.vec <- rnorm(n.data, data.mean.vec, 0.1)
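    ## For each random split, select the size with min validation loss.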
    select.each.split <- CJ(split.i=1:n.validation.sets, prop.valid)[, {
      set.seed(split.i)
      prop.sets <- c(subtrain=1-prop.valid, validation=prop.valid)
      is.valid <- binsegRcpp::random_set_vec(
        n.data, prop.sets)=="validation"
      bs.model <- binsegRcpp::binseg_normal(
        data.vec, is.validation.vec=is.valid)
      bs.model$splits[, .(selected=segments[which.min(validation.loss)])]
    }, by=.(split.i, prop.valid)]
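    ## For each number of splits used so far, take the most frequently
    ## selected model size (the min err, max times rule).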
    data.table(n.splits=1:n.validation.sets)[, {
      select.each.split[split.i <= n.splits, .(
        times=.N
      ),
      by=.(prop.valid, selected)
      ][, .SD[which.max(times), .(selected)], by=prop.valid]
    }, by=n.splits]
  }, by=data.seed]

  if(require(ggplot2)){
    ggplot()+
      scale_color_gradient(low="red", high="black")+
      geom_line(aes(
        n.splits, selected,
        group=paste(data.seed, prop.valid),
        color=prop.valid),
        data=sim.result)+
      scale_y_continuous(breaks=seq(0, 100, by=10))
  }

  ## For each proportion and number of splits, count the data sets (out
  ## of 100) for which the selected size equals the true number of segments.
  accuracy.dt <- sim.result[, .(
    correct=sum(selected==n.segments)
  ), by=.(prop.valid, n.splits)]
  if(require(ggplot2)){
    gg <- ggplot()+
      geom_line(aes(
        n.splits, correct,
        group=prop.valid,
        color=prop.valid),
        size=2,
        data=accuracy.dt)+
      scale_y_continuous("number of data sets with correct selection")+
      scale_color_gradient(low="red", high="black")
    if(require(directlabels)){
      direct.label(gg, "right.polygons")
    }else{
      gg
    }
  }
}

The accuracy plot above suggests that 100 validation sets are sufficient for this problem.
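
To read a sufficient number of splits off the results rather than the plot, a sketch like the following (assuming accuracy.dt from the block above is available) reports, for each validation proportion, the smallest number of splits that first attains its maximum observed accuracy.

## For each proportion: best accuracy over all n.splits, and the
## smallest number of splits at which that accuracy is attained.
accuracy.dt[, .(
  max.correct=max(correct),
  min.splits.at.max=min(n.splits[correct==max(correct)])
), by=prop.valid]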
