The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

rSDR vignette

Sheau-Chiann Chen, Shilin Zhao and Hsin-Hsiung Bill Huang

Introduction

This R Markdown document demonstrates the usage of the rSDR package for Robust Sufficient Dimension Reduction (rSDR) using alpha-distance covariance and Stiefel manifold learning for estimation (Hsin-Hsiung Huang, Feng Yu & Teng Zhang, 2024). We apply the method to ionosphere dataset, including cross-validation, bootstrap methods for optimal alpha selection, and visualization of results for both 2 and 3 dimensions.

Setup

# 
# # Load rSDR package
# if (!requireNamespace("rSDR", quietly = TRUE)) {
#   if (!requireNamespace("devtools", quietly = TRUE)){   install.packages("devtools")
# }
#   library(devtools)
#   devtools::install_github("scchen1/rSDR")
#   # or
#   
#   #a local zip file (`rSDR-main.zip`) downloaded from GitHub
#   #devtools::install_local(path='yourpath/rSDR-main.zip')
# }


library(rSDR)

Data Preparation

We use the ionosphere dataset from the fdm2id package. The dataset has 33 predictor variables (V1-V33) and one response variable (V35, ‘g’ for good or ‘b’ for bad).

The data include a response variable Y with nx1 matrix and predictors X with nxp matrix, where n is the number of observation and p is the number of variable.

#Load ionosphere data in fdm2id package
utils::data("ionosphere", package = "fdm2id")

X<-as.matrix(ionosphere[,c(1:33)])
Y<-ifelse(ionosphere[,34]=='b',0,1)
# Y will be used for rSDR
Y<-matrix(Y,length(Y),1)

#Y.factor will be used in plot_rSDR
Y.factor<-factor(ionosphere$V35,levels=c('b','g'),labels=c('Bad','Good'))

Robust Sufficient Dimension Reduction

Example: 3-Dimensional Reduction

set.seed(2435)
sdr_result<-rSDR(X=X, Y=Y, d=3, alpha=0.3,maxiter=1000,tol=1e-7)


# An optimal of C is obtained by maximizing the target function using 
# ManifoldOptim method
head(sdr_result$C_value)
#>              [,1]        [,2]        [,3]
#> [1,]  0.060376161  0.12984351  0.32228204
#> [2,]  0.141480457 -0.09261385  0.08642944
#> [3,] -0.037722305  0.06518416  0.14365305
#> [4,]  0.205080004 -0.30551967 -0.00615399
#> [5,] -0.005404643  0.10617035  0.17880486
#> [6,]  0.225708005  0.06038528 -0.06383628

#The value of cost function f is equal to the negative of the target function.
sdr_result$f_value
#> [1] -0.05426311

#beta=sigma_x^(-0.5)%*%C 
head(sdr_result$beta)
#>              [,1]       [,2]       [,3]
#> [1,]  0.005052794  0.7568295  0.5552178
#> [2,]  0.104607845 -0.4298763  0.1027418
#> [3,] -0.286795008  0.3727621  0.8018906
#> [4,]  0.306400216 -0.7760211 -0.3561266
#> [5,] -0.161184953  0.2073087  0.5865177
#> [6,]  0.297500484  0.5492788 -0.5587245

#projected_data: This projects X onto the estimated SDR directions (X %*% beta),
#the projected matrix (data) n x d
head(sdr_result$projected_data)
#>            V1          V2         V3
#> 1  0.49558138  0.09203527 0.15575243
#> 2  0.22281560  1.53635693 1.15002405
#> 3  0.52522648 -0.25455816 0.07992585
#> 4 -0.12508370  3.09342626 1.02262469
#> 5  0.25594361 -0.02213188 0.17249223
#> 6 -0.09054116  1.13584722 0.86440192

Visualizing the 3D Projected Data

#Plot for 3-dimensional reduction using rSDR

plot_rSDR(projected_data=sdr_result$projected_data,Y=Y.factor,Y.name='group',colors=c("#374E55FF", "#DF8F44FF"))
Plot for 3-dimensional reduction using rSDR
Plot for 3-dimensional reduction using rSDR

Cross-Validation for Optimal Alpha (d=3)

# plan(multisession) will launch parallel workers running in the background
# to save running time. To shut down background workers launched this way, call
# plan(sequential)
# use all local cores except one
# future::plan(future::multisession, workers = future::availableCores() - 1)
# use 2 cores for parallel
future::plan("multisession", workers = 2)
set.seed(2435)
opt_results<-optimal_alpha_cv(alpha.v=c(0.3, 0.5, 0.7),X=X,Y=Y,d=3,kfolds=10)
#> This will take a few seconds or minutes to get the results.
#opt_results<-optimal_alpha_cv(alpha.v=c(0.3, 0.4, 0.5, 0.6, 0.7),X=X,Y=Y,d=3,kfolds=10)

# Optimal alpha using cross validation
opt_results$opt.alpha
#> [1] 0.7

# The cost function values by alpha (column) for each cross validation (row) 
head(opt_results$f_test)
#>              0.3         0.5         0.7
#> [1,] -0.04919316 -0.05200294 -0.04633166
#> [2,] -0.06920450 -0.08929782 -0.09433253
#> [3,] -0.04472843 -0.05253492 -0.05533650
#> [4,] -0.05847677 -0.04592416 -0.05414926
#> [5,] -0.08752592 -0.09585069 -0.09452867
#> [6,] -0.02952418 -0.02512230 -0.02467935

# The mean of cost function value by alpha 
opt_results$f_test.mean
#>         0.3         0.5         0.7 
#> -0.06205520 -0.06120896 -0.06648143

# The standard deviation of cost function value by alpha 
opt_results$f_test.sd
#>        0.3        0.5        0.7 
#> 0.01926043 0.02119046 0.02535447

# The reduced dimension
opt_results$d
#> [1] 3

# Number of folds
opt_results$kfolds
#> [1] 10
plot_alpha(opt_results=opt_results)
Mean and standard deviation of cost function (k-fold cross-validation method)
Mean and standard deviation of cost function (k-fold cross-validation method)

Bootstrap for Optimal Alpha (d=3)

# plan(multisession) will launch parallel workers running in the background
# to save running time. To shut down background workers launched this way, call
# plan(sequential)
# use all local cores except one
# future::plan(future::multisession, workers = future::availableCores() - 1)
# use 2 cores for parallel
future::plan("multisession", workers = 2)
set.seed(2435)
opt_results<-optimal_alpha_boot(alpha.v=c(0.3,0.5, 0.7),X=X,Y=Y,d=3,R=10)
#> This will take a few seconds or minutes to get the results.
#opt_results<-optimal_alpha_boot(alpha.v=c(0.3,0.4,0.5,0.6, 0.7),X=X,Y=Y,d=3,R=50)

# Optimal alpha using bootstrap method
opt_results$opt.alpha
#> [1] 0.7

# The cost function values by alpha (column) for R bootstrap replicates (row) 
head(opt_results$f_test)
#>              0.3         0.5         0.7
#> [1,] -0.04123228 -0.06510945 -0.06349116
#> [2,] -0.05346989 -0.05615554 -0.05670479
#> [3,] -0.06034808 -0.06623436 -0.07687898
#> [4,] -0.06069561 -0.06189878 -0.05231286
#> [5,] -0.06213373 -0.04805119 -0.05082178
#> [6,] -0.05106833 -0.04766037 -0.05188302

# The mean of cost function value by alpha 
opt_results$f_test.mean
#>         0.3         0.5         0.7 
#> -0.05348157 -0.05805082 -0.05836145

# The standard deviation of cost function value by alpha 
opt_results$f_test.sd
#>         0.3         0.5         0.7 
#> 0.007649731 0.007248535 0.008392497

# The reduced dimension
opt_results$d
#> [1] 3

# Number of bootstrap replicates
opt_results$R
#> [1] 10
plot_alpha(opt_results=opt_results)
Mean and standard deviation of cost function (bootstrap method)
Mean and standard deviation of cost function (bootstrap method)

Example: 2-Dimensional Reduction and Plotting

set.seed(2435)
sdr_result<-rSDR(X=X, Y=Y, d=2, alpha=0.3,maxiter=1000,tol=1e-7)

# An optimal of C is obtained by maximizing the target function using ManifoldOptim method
head(sdr_result$C_value)
#>              [,1]        [,2]
#> [1,] -0.195840175  0.01920542
#> [2,] -0.008489214  0.36357606
#> [3,] -0.089659622 -0.06152668
#> [4,]  0.025908724  0.35114685
#> [5,]  0.206643972  0.12064514
#> [6,] -0.317245433  0.13294555

#The value of cost function f is equal to the negative of the target function.
sdr_result$f_value
#> [1] -0.04539063

#beta=sigma_x^(-0.5)%*%C 
head(sdr_result$beta)
#>            [,1]        [,2]
#> [1,] -0.4503527 -0.46366907
#> [2,]  0.4871003  0.93057571
#> [3,] -0.1964792  0.06231069
#> [4,] -0.1416008  0.54261663
#> [5,]  0.8963119  0.19068888
#> [6,] -1.0070804 -0.46193622

#projected_data: This projects X onto the estimated SDR directions (X %*% beta), the projected matrix (data) n x d
head(sdr_result$projected_data)
#>           V1         V2
#> 1 -0.3147479  0.3907114
#> 2  0.5904977  0.8874504
#> 3 -0.2551743  0.3706773
#> 4 -0.1722560 -1.2104044
#> 5 -0.4147929  0.3816966
#> 6 -0.2916534 -0.4002997
#Plot for 2-dimensional reduction using rSDR
plot_rSDR(projected_data=sdr_result$projected_data,Y=Y.factor,Y.name='group',colors=c("#374E55FF", "#DF8F44FF"))

Conclusion

This document tested the core functionality of the rSDR package, including:

The package successfully processes the ionosphere dataset, optimizes the alpha parameter, and produces meaningful visualizations of the reduced dimensions.

Reference

Hsin-Hsiung Huang, Feng Yu & Teng Zhang (19 Feb 2024): Robust sufficient dimension reduction via α-distance covariance, Journal of Nonparametric Statistics, DOI:10.1080/10485252.2024.2313137

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.