The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

reproducibility - simulation

This script requires that the working directory includes the folders “data”, “results”, and “manuscript”. We obtained our results using R 4.3.0 (2023-04-21) with cornet 0.0.8 (2023-06-01) on a local machine (aarch64-apple-darwin20, macOS Ventura 13.4). Floating point errors might lead to slightly different results on other platforms.

Graphical abstract

grDevices::pdf("manuscript/figure_idea.pdf",width=5,height=2.5)

box <- function(x,y,width=0.22,height=0.2,labels="",cex=1,col="black",...){
  xs <- x + 0.5*c(-1,-1,1,1)*width
  ys <- y + 0.5*c(-1,1,1,-1)*height
  graphics::polygon(x=xs,y=ys,border=col,lwd=2,...)
  graphics::text(x=x,y=y,labels=labels,col=col,cex=cex)
}

graphics::par(mar=c(0,0,0,0))
graphics::plot.new()
graphics::plot.window(xlim=c(0,1),ylim=c(0,1))

v <- h <- 0.1

box(x=0+h,y=0.5,labels="outcomes,\nfeatures")
box(x=0.5,y=1-v,labels="initial binary\nclassification",col="red")
box(x=0.5,y=0+v,labels="numerical\nprediction",col="blue")
box(x=1-h,y=0.5,labels="final binary\nclassification",col="red")

d <- 0.02
graphics::arrows(x0=0.2+d,y0=0.5+c(-d,d),x1=0.4-d,y1=c(v,1-v),lwd=2,col=c("blue","red"))
graphics::arrows(x0=0.6+d,y0=c(v,1-v),x1=0.8-d,y1=0.5+c(-d,d),lwd=2,col=c("blue","red"))

graphics::text(x=0.4,y=0.55,labels="binary outcome:\nlogistic regression",col="red",cex=0.7,pos=3)
graphics::text(x=0.4,y=0.45,labels="numerical outcome:\nlinear regression",col="blue",cex=0.7,pos=1)
graphics::text(x=0.63,y=0.5,labels="combine\npredicted\nprobabilities",col="darkgrey",cex=0.7)
graphics::text(x=0.8,y=0.3,labels="transform\npredicted values to\npredicted probabilities",col="darkgrey",cex=0.7,pos=1)

grDevices::dev.off()

Examples

loss <- list()
for(i in seq_len(4)){
  loss[[i]] <- list()
  cat("mode:",i,"\n")
  for(j in seq_len(100)){
    set.seed(j)
    cat("iteration:",j,"\n")
    n0 <- 100; n1 <- 10000; p <- 500
    n <- n0 + n1
    X <- matrix(data=stats::rnorm(n*p),nrow=n,ncol=p)
    beta <- stats::rbinom(n=p,size=1,prob=0.05)*stats::rnorm(n=p)
    eta <- X %*% beta
    epsilon <- stats::rnorm(n=n)
    if(i==1){
      y <- eta + epsilon
    } else if(i==2){
      y <- ifelse(eta<0,-2,+2)+epsilon
      table(y>=0,eta>=0)
    } else if(i==3){
      y <- ifelse(eta<0,-sqrt(abs(eta+epsilon)),(eta+epsilon)^2)
    } else if(i==4){
      y <- eta + epsilon + stats::rbinom(n=n,size=1,prob=0.05)*(2*stats::rbinom(n=n,size=1,prob=0.5)-1)*1.5*max(abs(eta))
    }
    foldid <- rep(c(0,1),times=c(n0,n1))
    loss[[i]][[j]] <- cornet::cv.cornet(y=y,cutoff=0,X=X,foldid.ext=foldid)
  }
}

save(loss,file="results/simulation.RData")
writeLines(text=capture.output(utils::sessionInfo(),cat("\n"),
        sessioninfo::session_info()),con="results/info_sim.txt")
load("results/simulation.RData")

grDevices::pdf("manuscript/figure_EXA.pdf",width=5,height=5)

graphics::par(mfrow=c(2,2),mar=c(2,2,1,1))
pos <- c(binomial=1,combined=2,gaussian=3)
col <- c(binomial="red",combined="grey",gaussian="blue")
cex <- 0.7
names <- c("binomial","combined","gaussian")
for(i in seq_len(4)){
  frame <- as.data.frame(t(sapply(loss[[i]],function(x) x$deviance)))
  graphics::boxplot(x=frame[,names],at=pos[names],col=col[names],cex.axis=cex,main=paste0("example ",i),cex.main=cex,axes=FALSE)
  graphics::box()
  graphics::axis(side=1,at=pos[names],labels=names,cex.axis=cex,tick=FALSE,line=-1)
  graphics::axis(side=2,cex.axis=cex)
  for(j in c("binomial","combined","gaussian")){
    mean <- mean(frame[[j]])
    graphics::points(x=pos[j],y=mean,pch=21,col="white",bg="black")
    if(j=="combined"){next}
    pvalue <- stats::wilcox.test(x=frame$combined,y=frame[[j]],alternative="less")$p.value
    signif <- ifelse(pvalue<=0.05/8,"*","")
    graphics::text(x=mean(c(pos["combined"],pos[[j]])),y=min(frame),labels=paste0("p=",format(pvalue,digits=2,scientific=TRUE),signif),pos=3,cex=0.7)
  }
}

grDevices::dev.off()

(not included)

iter <- 1000
set.seed(1)
frame <- data.frame(cor=runif(n=iter,min=0,max=0.9),
                    n=round(runif(n=iter,min=100,max=200))+10000,
                    prob=runif(n=iter,min=0.01,max=0.1),
                    sd=runif(n=iter,min=1,max=2),
                    exp=runif(n=iter,min=0.1,max=2),
                    frac=runif(n=iter,min=0.5,max=0.9))

ridge <- lasso <- list()
pb <- utils::txtProgressBar(min=0,max=nrow(frame),width=20,style=3)
for(i in seq_len(nrow(frame))){
    utils::setTxtProgressBar(pb=pb,value=i)
    set.seed(i)
    data <- do.call(what=cornet:::.simulate,args=cbind(frame[i,],p=500))
    foldid <- rep(c(0,1),times=c(frame$n[i],10000))
    set.seed(i)
    ridge[[i]] <- do.call(what=cornet:::cv.cornet,args=c(data,alpha=0,foldid=foldid))
    set.seed(i)
    lasso[[i]] <- do.call(what=cornet:::cv.cornet,args=c(data,alpha=1,foldid=foldid))
}
names(lasso) <- names(ridge) <- paste0("set",seq_len(nrow(frame)))
save(lasso,ridge,frame,file="results/simulation.RData")

writeLines(text=capture.output(utils::sessionInfo(),cat("\n"),
        sessioninfo::session_info()),con="results/info_sim.txt")
#--- boxplot of different metrics ---
load("results/simulation.RData",verbose=TRUE)

fuse0 <- fuse1 <- list()
for(i in c("deviance","class","mse","mae","auc")){
  fuse0[[i]] <- sapply(ridge,function(x) (x[[i]]["combined"]-x[[i]]["binomial"]))
  fuse1[[i]] <- sapply(lasso,function(x) (x[[i]]["combined"]-x[[i]]["binomial"]))
}

grDevices::pdf("manuscript/figure_BOX.pdf",width=6,height=4)
graphics::par(mar=c(1.9,1.9,0.1,0.1))
graphics::plot.new()
ylim <- range(unlist(fuse0),unlist(fuse1))
at <- seq(from=1,to=9,by=2)
graphics::plot.window(xlim=c(min(at)-0.6,max(at)+0.6),ylim=ylim)
graphics::axis(side=2)
graphics::abline(h=0,col="grey",lty=2)
graphics::abline(v=at+1,col="grey",lty=2)
graphics::box()
graphics::boxplot(fuse1,at=at-0.5,add=TRUE,axes=FALSE,col="white",border="black")
graphics::boxplot(fuse0,at=at+0.5,add=TRUE,axes=FALSE,col="white",border="darkgrey")
labels <- names(fuse1)
labels <- ifelse(labels=="class","mcr",labels)
labels <- ifelse(labels %in% c("mcr","mse","mae","auc"),toupper(labels),labels)
for(i in seq_along(labels)){
  graphics::axis(side=1,at=at[i],labels=bquote(Delta ~ .(labels[i])))
}
grDevices::dev.off()

# decrease
sapply(fuse1,function(x) mean(x<0)) # lasso
sapply(fuse0,function(x) mean(x<0)) # ridge

# constant
sapply(fuse1,function(x) mean(x==0)) # lasso
sapply(fuse0,function(x) mean(x==0)) # ridge

# increase
sapply(fuse1,function(x) mean(x>0)) # lasso
sapply(fuse0,function(x) mean(x>0)) # ridge
#--- plot of percentage changes ---
load("results/simulation.RData",verbose=TRUE)

loss <- list()
loss$ridge <- as.data.frame(t(sapply(ridge,function(x) x$deviance)))
loss$lasso <- as.data.frame(t(sapply(lasso,function(x) x$deviance)))

data <- list()
for(i in c("ridge","lasso")){
  data[[i]] <- data.frame(row.names=rownames(frame))
  data[[i]]$"(1)" <- 100*(loss[[i]]$binomial-loss[[i]]$intercept)/loss[[i]]$intercept
  data[[i]]$"(2)" <- 100*(loss[[i]]$combined-loss[[i]]$intercept)/loss[[i]]$intercept
  data[[i]]$"(3)" <- 100*(loss[[i]]$combined-loss[[i]]$binomial)/loss[[i]]$binomial
}

row <- colnames(data$lasso)
col <- colnames(frame)
txt <- expression(rho,n,s,sigma,t,q)

for(k in c("ridge","lasso")){
  grDevices::pdf(paste0("manuscript/figure_",k,".pdf"),width=6.5,height=4)
  graphics::par(mfrow=c(length(row),length(col)),
              mar=c(0.2,0.2,0.2,0.2),oma=c(4,4,0,0))
  for(i in seq_along(row)){
    for(j in seq_along(col)){
      y <- data[[k]][[row[i]]]
      x <- frame[[col[j]]]
      graphics::plot.new()
      graphics::plot.window(xlim=range(x),ylim=range(y),xaxs="i")
      graphics::box()
      graphics::abline(h=0,lty=1,col="grey")
      graphics::points(y=y,x=x,cex=0.5,pch=16,col=ifelse(y>0,"black","grey"))
      line <- stats::loess.smooth(y=y,x=x,evaluation=200)
      graphics::lines(x=line$x,y=line$y,col="black",lty=2,lwd=1)
      if(j==1){
        graphics::mtext(text=row[i],side=2,line=2.5,las=2)
        graphics::axis(side=2)
      }
      if(i==length(row)){
        graphics::mtext(text=txt[j],side=1,line=2.5)
        graphics::axis(side=1)
      }
    }
  }
  grDevices::dev.off()
}

cbind(col,as.character(txt)) # verify

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.