This package provides functions for generating all possible splits of variables into groups, and computing the best split selection regression estimator for low-dimensional data.
You can install the stable version from CRAN.
install.packages("splitSelect", dependencies = TRUE)
You can install the development version from GitHub.
library(devtools)
devtools::install_github("AnthonyChristidis/splitSelect")
Here is some code to generate all possible splits of variables into groups.
# Loading library
library(splitSelect)
# Setting number of variables and groups
p <- 8
G <- 4
use.all <- TRUE
# Generate the number of partitions
my.partitions <- generate_partitions(p, G, use.all=use.all)
my.partitions
# Generate the number of splits
nsplit(p, G, use.all=use.all)
# Generate the number of splits (fixed partition)
nsplit(p, G, use.all=use.all,
       fix.partition=matrix(c(2,2,2,2), nrow=1))
# Generate the splits
all.splits <- generate_splits(p, G, use.all=use.all)
head(all.splits)
nrow(all.splits)
# Generate the splits (fixed partition)
all.splits <- generate_splits(p, G, use.all=use.all,
                              fix.partition=matrix(c(2,2,2,2), nrow=1))
head(all.splits)
nrow(all.splits)
# Generate samples of splits
sample.splits <- rsplit(10000, p, G, fix.partition=matrix(c(2,2,2,2), nrow=1))
sample.splits
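As a quick combinatorial sanity check (illustrative only, not part of the package), the number of splits of p labeled variables into groups of fixed sizes is a multinomial coefficient, divided by the number of orderings of equally sized groups if group labels are treated as interchangeable. A minimal sketch for the fixed partition (2,2,2,2) used above:

# Illustrative sanity check (assumed semantics): count splits of p variables
# into groups of fixed sizes by direct combinatorics, for comparison with
# the nsplit() output above.
sizes <- c(2, 2, 2, 2)
# Multinomial coefficient: assignments of variables to labeled groups
n.labeled <- factorial(sum(sizes)) / prod(factorial(sizes))
# If equally sized groups are interchangeable, divide by the number of
# orderings of the groups (valid here since all four sizes are equal)
n.unlabeled <- n.labeled / factorial(length(sizes))
c(labeled = n.labeled, unlabeled = n.unlabeled)  # 2520 and 105

Depending on whether the package counts group labels as distinct, the nsplit() call above should match one of these two counts.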
Here is some code to compute the best split selection estimator for simulated data with spurious correlation in the training set.
# Download the packages
install.packages("simTargetCov")
install.packages("glmnet")
install.packages("SplitReg")
# Setting the parameters
p <- 6
n <- 30
n.test <- 5000
group.beta <- 5
beta <- c(rep(1, 2), rep(group.beta, p-2))
rho <- 0.1
r <- 0.9
SNR <- 3

# Creating the target matrix with "kernel" set to rho
target_cor <- function(r, p){
  Gamma <- diag(p)
  for(i in 1:(p-1)){
    for(j in (i+1):p){
      Gamma[i,j] <- Gamma[j,i] <- r^(abs(i-j))
    }
  }
  return(Gamma)
}

# AR Correlation Structure
Sigma.r <- target_cor(r, p)
Sigma.rho <- target_cor(rho, p)
sigma.epsilon <- as.numeric(sqrt((t(beta) %*% Sigma.rho %*% beta)/SNR))
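# Quick empirical check (illustrative, not part of the package): with
# x ~ N(0, Sigma.rho), Var(x %*% beta) = t(beta) %*% Sigma.rho %*% beta,
# so sigma.epsilon above is calibrated to hit the target signal-to-noise
# ratio. The mvnfast package is assumed available, as it is used below.
x.check <- mvnfast::rmvn(1e5, mu=rep(0, p), sigma=Sigma.rho)
var(as.numeric(x.check %*% beta)) / sigma.epsilon^2  # should be close to SNR = 3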
# Number of cores
nb.cores <- parallel::detectCores()-1
# Registering the clusters
cl <- parallel::makeCluster(nb.cores)
doParallel::registerDoParallel(cl)
# Set the seed
set.seed(0)
# Simulate some data
x.train <- simTargetCov::simTargetCov(n=n, p=p, target=Sigma.r)
y.train <- 1 + x.train %*% beta + rnorm(n=n, mean=0, sd=sigma.epsilon)
x.test <- mvnfast::rmvn(n.test, mu=rep(0,p), sigma=Sigma.rho)
y.test <- 1 + x.test %*% beta + rnorm(n.test, sd=sigma.epsilon)
# Best Split Selection for Regression
system.time(
  split.out <- cv.splitSelect(x.train, y.train, G=2, use.all=TRUE,
                              fix.partition=list(matrix(c(2,4,
                                                          3,3), ncol=2, byrow=TRUE)), fix.split=NULL,
                              intercept=TRUE, group.model="glmnet", alpha=0, nfolds=10,
                              parallel=TRUE, cores=nb.cores)
)
split.predictions <- predict(split.out, newx=x.test)
mean((split.predictions-y.test)^2)/sigma.epsilon^2
# Ending the cluster
parallel::stopCluster(cl)
# Ridge Regression
cv.ridge <- glmnet::cv.glmnet(x.train, y.train, alpha=0)
ridge <- glmnet::glmnet(x.train, y.train, alpha=0, lambda=cv.ridge$lambda.min)
ridge.predictions <- predict(ridge, newx=x.test)
mean((ridge.predictions-y.test)^2)/sigma.epsilon^2
# Lasso
cv.lasso <- glmnet::cv.glmnet(x.train, y.train, alpha=1)
lasso <- glmnet::glmnet(x.train, y.train, alpha=1, lambda=cv.lasso$lambda.min)
lasso.predictions <- predict(lasso, newx=x.test)
mean((lasso.predictions-y.test)^2)/sigma.epsilon^2
# Elastic Net
cv.elastic <- glmnet::cv.glmnet(x.train, y.train, alpha=3/4)
elastic <- glmnet::glmnet(x.train, y.train, alpha=3/4, lambda=cv.elastic$lambda.min)
elastic.predictions <- predict(elastic, newx=x.test)
mean((elastic.predictions-y.test)^2)/sigma.epsilon^2
# SplitReg
cv.splitreg <- SplitReg::cv.SplitReg(x.train, y.train, num_models=3, alpha=1e-2)
splitreg.predictions <- predict(cv.splitreg, newx=x.test)
mean((splitreg.predictions-y.test)^2)/sigma.epsilon^2
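To compare the methods at a glance, the repeated scoring above can be wrapped in a small helper; the relative.mspe function below is illustrative only and not part of the package.

# Illustrative helper (not part of the package): relative MSPE, i.e. the
# test mean squared prediction error scaled by the irreducible noise variance.
relative.mspe <- function(pred, y, noise.sd) mean((pred - y)^2) / noise.sd^2
c(split    = relative.mspe(split.predictions,    y.test, sigma.epsilon),
  ridge    = relative.mspe(ridge.predictions,    y.test, sigma.epsilon),
  lasso    = relative.mspe(lasso.predictions,    y.test, sigma.epsilon),
  elastic  = relative.mspe(elastic.predictions,  y.test, sigma.epsilon),
  splitreg = relative.mspe(splitreg.predictions, y.test, sigma.epsilon))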
# Looking at the MSPEs for all the possible splits (out-of-sample)
split.mspes <- sapply(1:nrow(split.out$splits), function(x, n.test, x.test, split.out, y.test){
  mean((y.test-cbind(rep(1, n.test), x.test) %*% split.out$betas[,x])^2)/sigma.epsilon^2},
  n.test, x.test, split.out, y.test)

# Minimum MSPE for the splits (out-of-sample)
min(split.mspes)
split.mspes[split.out$optimal.split]

# Optimal splits comparison (out-of-sample)
split.out$splits[which.min(split.mspes),]
split.out$optimal.split.var

# Optimal betas comparison (out-of-sample)
split.out$betas[,which.min(split.mspes), drop=FALSE]
coef(split.out)
This package is free and open source software, licensed under GPL (>= 2).