When you want to test multiple formulas at the same time without
looking at a plot, you need an automated way to rank formulas. The
following sections show a way to do that.
Differential analysis
First join in a single data.frame the new formulas to analyze, and
define a “best” objective function you want to overcome.
opt.results <- do.call(rbind, prev.res)
best.obj <- 0.1
Define a fitness function, that will have higher values on better
formulas. You can use the built-in function in symbolicr
symbolicr::pe.r.squared.formula.len.fitness or define your
own as below:
my.fitness <- function(errs.m, max.formula.len){
x0 <- 0.4
r <- as.double(errs.m$base.r.squared)
flen <- as.integer(errs.m$formula.len)
pe <- as.double(errs.m$base.pe)
denominator <- exp(10*pe)
numerator <- sign(r)*(r*(max.formula.len/flen))^2
numerator / denominator
}
Then this procedure reports all the formulas that have a fitness
function higher than the one defined
new.res <- lapply(seq(length(prev.res)), function(i){
# get current data store
opt.results <- prev.res[[i]]
howmany <- names(prev.res)[i]
# select howmany rows should be checked (set howmany to nrow(opt.results) to check all)
rows <- seq(nrow(opt.results)-as.integer(howmany)+1, nrow(opt.results))
## COMPUTE FITNESS FOR EVERY NEW RESULT COMPUTED
check.res <- opt.results[rows, ]
check.res$obj <- apply(check.res, MARGIN=1, FUN=function(row) my.fitness(as.data.frame(t(row)), max.formula.len))
promising.res.idxs <- which(check.res$obj >= best.obj)
if(length(promising.res.idxs) == 0)
return(check.res)
else
print(paste0("Updating dataset ",i,"..."))
promising.res <- check.res[promising.res.idxs, ]
max.n.squares <- max(promising.res$n.squares)
## (SLOW) CROSS-VALIDATE HARDER THE PROMISING RESULTS (the one with fitness higher than best.obj)
# rechecked.promising.res.l <- apply(promising.res, MARGIN=1, FUN=function(row){
# cur.best.vars <- strsplit(row[['vars']], ",")[[1]]
# set.seed(0)
# print(cur.best.vars)
# experiments <- cross.validate(regressors.df, l.F2, cur.best.vars, custom.abs.mins = list(), K=K, N=N, n.squares=max.n.squares,
# transformations=transformations, cv.norm = T)
#
# base.best.errors <- data.frame(base.pe=round(mean(experiments[, 'base.pe']), 3),
# #base.pe.sd=round(sd(experiments[, 'base.pe']), 3),
# base.cor=round(mean(experiments[, 'base.cor']), 3),
# #base.cor.sd=round(sd(experiments[, 'base.cor']), 3),
# base.r.squared=round(mean(experiments[, 'base.r.squared']), 3),
# #base.r.squared.sd=round(sd(experiments[, 'base.r.squared']), 3)
# base.max.pe=round(mean(experiments[, 'base.max.pe']), 3),
# base.iqr.pe=round(mean(experiments[, 'base.iqr.pe']), 3),
# base.max.cooksd=round(mean(experiments[, 'base.max.cooksd']), 3),
# base.max.cooksd.name=paste(unique(experiments$base.max.cooksd.name), collapse=","),
# vars=row[['vars']],
# n.squares=row[['n.squares']],
# formula.len=row[['formula.len']]
# )
# base.best.errors$obj <- pe.r.squared.formula.len.fitness(base.best.errors, max.formula.len = max.formula.len)
# base.best.errors
# })
#
# rechecked.promising.res<-do.call(rbind, rechecked.promising.res.l)
#
# # update & reorder
# check.res[promising.res.idxs, ] <- rechecked.promising.res
return(check.res)
})
# see new candidates better than best.vars
better.res <- lapply(new.res, function(opt.results){
opt.results[opt.results$obj>=best.obj, ]
})
better.res