The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

smoothPLS_multi_states_03

This notebook show the Smooth PLS algorithm for a multi-states Categorical Functional Data (MS-CFD).

Parameters


N_states = 3

nind = 100 # number of individuals (train set)
start = 0 # First time
end = 100 # end time

curve_type = 'cat'

TTRatio = 0.2 # Train Test Ratio means we have floor(nind*TTRatio/(1-TTRatio))
NotS_ratio = 0.2 # noise variance over total variance for Y
beta_0_real=65.4321 # Intercept value for the link between X(t) and Y

nbasis = 10 # number of basis functions
norder = 4 # 4 for cubic splines basis

regul_time = seq(start, end, 5) # regularisation time sequence
regul_time_0 = seq(start, end, 1)

int_mode = 1 # in case of integration errors.

Data generation

lambda_determination

# Initialized the lambdas values
lambdas = lambda_determination(N_states)
lambdas
#> [1] 0.16218960 0.09130628 0.07550633

tranfer_probabilities

# Initialized the transition matrix
transition_df = transfer_probabilities(N_states)
transition_df
#>              dir1      dir2      dir3
#> state_1 0.0000000 0.7863556 0.2136444
#> state_2 0.4314136 0.0000000 0.5685864
#> state_3 0.5306123 0.4693877 0.0000000

df_cfd

df_cfd = generate_X_df_multistates(nind = nind, N_states, start, end,
                              lambdas,  transition_df)
head(df_cfd)
#>   id       time state
#> 1  1  0.0000000     3
#> 2  1  0.4182081     2
#> 3  1  1.5448560     3
#> 4  1  5.7064580     2
#> 5  1 35.5646072     1
#> 6  1 44.2917255     3

plot_CFD_individuals(df_cfd)

plot_CFD_individuals(df_cfd, by_cfda = TRUE)

Basis creation

All the states will share the same basis.

basis = create_bspline_basis(start, end, nbasis, norder)
#basis = fda::create.fourier.basis(c(start,end), nbasis = nbasis)

# All the states will share the same basis.
basis_list = obj_list_creation(N_rep = N_states, obj = basis)

plot(basis, main=paste0(nbasis, " ", basis$type," functions basis"))

Data processing

We have to prepare the data before the FPLS method. For each state, we build its indicator function.

names(df_cfd)
#> [1] "id"    "time"  "state"

cat_data_to_indicator

df_processed = cat_data_to_indicator(df_cfd)

length(df_processed)
#> [1] 3
names(df_processed)
#> [1] "state_1" "state_2" "state_3"

head(df_processed$state_3, 20)
#>    id        time state
#> 1   1   0.0000000     1
#> 2   1   0.4182081     0
#> 3   1   1.5448560     1
#> 4   1   5.7064580     0
#> 6   1  44.2917255     1
#> 7   1  49.3986723     0
#> 12  1  94.5885722     1
#> 13  1 100.0000000     1
#> 14  2   0.0000000     0
#> 16  2  23.5972614     1
#> 17  2  35.2188747     0
#> 19  2  62.3399729     1
#> 20  2  69.0495525     0
#> 23  2 100.0000000     0
#> 24  3   0.0000000     1
#> 25  3   7.7964939     0
#> 27  3  15.5383881     1
#> 28  3  26.7390645     0
#> 29  3  38.7901396     1
#> 30  3  60.9841073     0

beta list

##### beta_real #####
###### beta_0_real ######
beta_0_real
#> [1] 65.4321

beta_func_list = beta_list_generation(N_states = N_states)

for(i in 1:length(beta_func_list)){
  plot(0:end,  beta_func_list[[i]](0:end, end_time = end),
       ylab=paste0("Beta(t) n°=", i), type = 'l')
  title(paste0("Beta(t) n°=", i))
}

Y evaluation

Y generation is based on the following equation : $Y = 0 + {i=1}^K _0^T _i(t) ind_i(t) dt $ with $ind_i(t) = \{0, 1\}_{t \in [0, T]}$ the indicator function of the state $i$.

We link $\beta_i$ with the $state_i$.

Y_df = generate_Y_df(df = df_processed, curve_type = curve_type,
                     beta_real_func_or_list =  beta_func_list,
                     beta_0_real = beta_0_real,
                     NotS_ratio = NotS_ratio)
Y = Y_df$Y_noised
names(Y_df)
#> [1] "id"       "Y_beta1"  "Y_beta2"  "Y_beta3"  "Y_real"   "Y_noised"

head(Y_df)
#>   id    Y_beta1    Y_beta2     Y_beta3    Y_real  Y_noised
#> 1  1  65.806580  -8.252748  -7.4320011 115.55393 117.36995
#> 2  2  26.027793   3.651140   0.2430665  95.35410  98.68405
#> 3  3  18.239666  -6.248977 -19.0326323  58.39016 102.56355
#> 4  4   5.205643  22.113474 -19.5805494  73.17067  85.04207
#> 5  5 -12.639639  38.985054  -9.9986223  81.77889  49.19584
#> 6  6 -13.391616 -20.803539 -20.0241998  11.21275  -6.47791

Test set

nind_test = floor(nind*TTRatio/(1-TTRatio))
df_test = generate_X_df_multistates(nind = nind_test, N_states, start, end,
                              lambdas,  transition_df)

df_test_processed = cat_data_to_indicator(df_test)

Y_df_test = generate_Y_df(df_test_processed, curve_type = curve_type,
                          beta_real_func_or_list = beta_func_list,
                          beta_0_real = beta_0_real,
                          NotS_ratio= NotS_ratio)

PLS functions

Naive PLS

naivePLS_obj = naivePLS(df_list = df_cfd, Y = Y, regul_time_obj = regul_time,
                        curve_type_obj = 'cat', 
                        id_col_obj = 'id', time_col_obj = 'time',
                        print_steps = TRUE, plot_rmsep = TRUE,
                        print_nbComp = TRUE,plot_reg_curves = TRUE)
#> ### Naive PLS ### 
#> => Input format assertions.
#> => Input format assertions OK.
#> => Data formatting.
#> => PLS model.

#> [1] "Optimal number of PLS components :  2"

Functional PLS

fpls_obj = funcPLS(df_list = df_cfd, Y = Y_df$Y_noised,
                   basis_obj = basis,
                   curve_type_obj = 'cat',
                   regul_time_obj = regul_time,
                   id_col_obj = 'id', time_col_obj = 'time',
                   print_steps = TRUE, plot_rmsep = TRUE, 
                   print_nbComp = TRUE, plot_reg_curves = TRUE)
#> ### Functional PLS ### 
#> => Input format assertions.
#> => Input format assertions OK.
#> => Building alpha matrix.
#> => Building curve names.
#> ==> Evaluating alpha for : CatFD_1_1.
#>  ==> Evaluating alpha for : CatFD_1_2.
#>  ==> Evaluating alpha for : CatFD_1_3.
#> => Evaluate metrix and root_metric.
#> => plsr(Y ~ trans_alphas).

#> Optimal number of PLS components :  2 .
#> => Build Intercept and regression curves for optimal number of components.
#>  ==> Build  regression curve for : CatFD_1_1
#>  ==> Build  regression curve for : CatFD_1_2
#>  ==> Build  regression curve for : CatFD_1_3

Smooth PLS

spls_obj = smoothPLS(df_list = df_cfd, Y = Y_df$Y_noised, basis_obj = basis,
                     orth_obj = TRUE, curve_type_obj = 'cat',
                     int_mode = 1, 
                     id_col_obj ='id', time_col_obj = 'time',
                     print_steps = TRUE, plot_rmsep = TRUE,
                     print_nbComp = TRUE, plot_reg_curves = TRUE)
#> ### Smooth PLS ### 
#> ## Use parallelization in case of heavy computational load. ## 
#> ## Threshold can be manualy adjusted : (default 2500) ## 
#> ## >options(SmoothPLS.parallel_threshold = 500) ## 
#> => Input format assertions.
#> => Input format assertions OK.
#> => Orthonormalize basis.
#> => Data objects formatting.
#> => Evaluate Lambda matrix.
#> ==> Lambda for : CatFD_1_state_1.
#> ==> Lambda for : CatFD_1_state_2.
#> ==> Lambda for : CatFD_1_state_3.
#> => PLSR model.
#> => Optimal number of PLS components : 2

#> => Evaluate SmoothPLS functions and <w_i, p_j> coef.
#> => Build regression functions and intercept.
#> ==> Build regression curve for : CatFD_1_state_1
#> ==> Build regression curve for : CatFD_1_state_2
#> ==> Build regression curve for : CatFD_1_state_3

names(spls_obj$reg_obj)
#> [1] "Intercept"       "CatFD_1_state_1" "CatFD_1_state_2" "CatFD_1_state_3"

Curves comparison

# Warning ms_spls_obj$delta_ms_list[[1]] is the intercept!
cat("curve_1 : smooth PLS regression curve.\n")
#> curve_1 : smooth PLS regression curve.
cat("curve_2 : functional PLS regression curve.\n")
#> curve_2 : functional PLS regression curve.
cat("curve_3 : naive PLS regression coefficients\n")
#> curve_3 : naive PLS regression coefficients

for(i in 1:N_states){
  start = 0
  print(paste0("State_", (i)))
  evaluate_curves_distances(real_f = beta_func_list[[i]],
                          regul_time = regul_time, 
                          fun_fd_list = list(spls_obj$reg_obj[[i+1]], 
                                             fpls_obj$reg_obj[[i+1]],
                                             approxfun(
                                               x = regul_time,
                                               y = naivePLS_obj$opti_reg_coef[
                                                 start:(start+length(regul_time)
                                                        )])
                                             )
                          )
  start = start + length(regul_time)
  
}
#> [1] "State_1"
#> [1] "real_f -> curve_1 / INPROD  : 47.4572000558795 / DIST : 13.2905098633588"
#> [1] "real_f -> curve_2 / INPROD  : 43.6591352214833 / DIST : 13.6320162598076"
#> [1] "real_f -> curve_3 / INPROD  : 13.4690073074092 / DIST : 27.8941820901434"
#> [1] "State_2"
#> [1] "real_f -> curve_1 / INPROD  : 22.9684162701822 / DIST : 14.1561671305604"
#> [1] "real_f -> curve_2 / INPROD  : 25.7051668331251 / DIST : 15.742963397086"
#> [1] "real_f -> curve_3 / INPROD  : 31.4665194389073 / DIST : 38.1269198992419"
#> [1] "State_3"
#> [1] "real_f -> curve_1 / INPROD  : 25.9590236019394 / DIST : 15.1535445344829"
#> [1] "real_f -> curve_2 / INPROD  : 27.0203400870638 / DIST : 11.7454038364192"
#> [1] "real_f -> curve_3 / INPROD  : -64.0495576976778 / DIST : 37.3104708298011"

for(i in 1:N_states){
  start = 0
  
  y_lim = eval_max_min_y(f_list = list(spls_obj$reg_ob[[i+1]],
                                       fpls_obj$reg_ob[[i+1]],
                                       approxfun(
                                         x = regul_time,
                                         y = naivePLS_obj$opti_reg_coef[
                                           start:(start+length(regul_time))]),
                                       beta_func_list[[i]]
                                       ), 
                         regul_time = regul_time_0)
  
  plot(regul_time_0, beta_func_list[[i]](regul_time_0), col = 'black', 
       ylim = y_lim, xlab = 'Time', ylab = 'Value', type = 'l')
  lines(regul_time_0, approxfun(x = regul_time,
                                y = naivePLS_obj$opti_reg_coef[
                                  start:(start+
                                           length(regul_time))])(regul_time_0), 
       col = 'green')
  title(paste0(names(spls_obj$reg_obj)[i+1], " regression curves"))
  plot(spls_obj$reg_obj[[i+1]], col = 'blue', add = TRUE)
  plot(fpls_obj$reg_obj[[i+1]], col = 'red', add = TRUE)
  legend("topleft",
         legend = c("Real curve", "NaivePLS coef", 
                    "SmoothPLS reg curve", "FunctionalPLS reg curve"),
         col = c("black", "green", "blue", "red"),
         lty = 1,
         lwd = 1)
  
  start = start + length(regul_time)
}

Results

train_results = data.frame(matrix(ncol = 5, nrow = 3))
colnames(train_results) = c("PRESS", "RMSE", "MAE", "R2", "var_Y")
rownames(train_results) = c("NaivePLS", "FPLS", "SmoothPLS")

test_results = train_results

print(paste0("There is ", 100*NotS_ratio, "% of noised in Y"))
#> [1] "There is 20% of noised in Y"

Train set

Y_train = Y_df$Y_noised

# Naive
Y_hat = predict(naivePLS_obj$plsr_model, 
                ncomp = naivePLS_obj$nbCP_opti, 
                newdata = naivePLS_obj$plsr_model$model$`as.matrix(df_mod_wide)`)
train_results["NaivePLS", ] = evaluate_results(Y_train, Y_hat)


# FPLS
Y_hat_fpls = (predict(fpls_obj$plsr_model, ncomp = fpls_obj$nbCP_opti,
                newdata = fpls_obj$trans_alphas) 
              + fpls_obj$reg_obj$Intercept
              + mean(Y))

Y_hat_fpls = smoothPLS_predict(df_predict_list = df_cfd,
                               delta_list = fpls_obj$reg_obj, 
                               curve_type_obj = curve_type,
                               int_mode = int_mode,
                               regul_time_obj = regul_time)

train_results["FPLS", ] = evaluate_results(Y_train, Y_hat_fpls)

# Smooth PLS
Y_hat_spls = smoothPLS_predict(df_predict_list = df_cfd,
                               delta_list = spls_obj$reg_obj, 
                               curve_type_obj = curve_type,
                               int_mode = int_mode,
                               regul_time_obj = regul_time)

train_results["SmoothPLS", ] = evaluate_results(Y_train, Y_hat_spls)

train_results["NaivePLS", "nb_cp"] = naivePLS_obj$nbCP_opti
train_results["FPLS", "nb_cp"] = fpls_obj$nbCP_opti
train_results["SmoothPLS", "nb_cp"] = spls_obj$nbCP_opti

train_results
#>              PRESS     RMSE      MAE        R2     var_Y nb_cp
#> NaivePLS  56851.26 23.84350 18.74728 0.8052915  80.52915     2
#> FPLS      51006.48 22.58462 17.48461 0.8253091  77.95544     2
#> SmoothPLS 54449.52 23.33442 18.54678 0.8135171 111.55514     2

Test set

Y_test = Y_df_test$Y_noised

# Naive
df_test_wide = naivePLS_formatting(df_list = df_test,
                                  regul_time_obj = regul_time,
                                  curve_type_obj = curve_type, 
                                  id_col_obj = 'id', time_col_obj = 'time')

Y_hat = predict(naivePLS_obj$plsr_model,
                ncomp = naivePLS_obj$nbCP_opti, 
                newdata = as.matrix(df_test_wide))
test_results["NaivePLS", ] = evaluate_results(Y_test, Y_hat)

# FPLS
Y_hat_fpls = smoothPLS_predict(df_predict_list = df_test,
                               delta_list = fpls_obj$reg_obj,
                               curve_type_obj = curve_type,
                               int_mode = int_mode, 
                               regul_time_obj = regul_time) 

test_results["FPLS", ] = evaluate_results(Y_test, Y_hat_fpls)

# Smooth PLS
Y_hat_spls = smoothPLS_predict(df_predict_list = df_test,
                               delta_list = spls_obj$reg_obj,
                               curve_type_obj = curve_type,
                               int_mode = int_mode, 
                               regul_time_obj = regul_time)

test_results["SmoothPLS", ] = evaluate_results(Y_test, Y_hat_spls)

test_results["NaivePLS", "nb_cp"] = naivePLS_obj$nbCP_opti
test_results["FPLS", "nb_cp"] = fpls_obj$nbCP_opti
test_results["SmoothPLS", "nb_cp"] = spls_obj$nbCP_opti

test_results
#>              PRESS     RMSE      MAE        R2     var_Y nb_cp
#> NaivePLS  12668.95 22.51128 17.71806 0.7918992  96.46796     2
#> FPLS      13999.53 23.66392 17.75687 0.7700429  91.67738     2
#> SmoothPLS 17130.76 26.17691 20.60632 0.7186092 135.21697     2

Plot results

train_results
#>              PRESS     RMSE      MAE        R2     var_Y nb_cp
#> NaivePLS  56851.26 23.84350 18.74728 0.8052915  80.52915     2
#> FPLS      51006.48 22.58462 17.48461 0.8253091  77.95544     2
#> SmoothPLS 54449.52 23.33442 18.54678 0.8135171 111.55514     2

test_results
#>              PRESS     RMSE      MAE        R2     var_Y nb_cp
#> NaivePLS  12668.95 22.51128 17.71806 0.7918992  96.46796     2
#> FPLS      13999.53 23.66392 17.75687 0.7700429  91.67738     2
#> SmoothPLS 17130.76 26.17691 20.60632 0.7186092 135.21697     2

plot_model_metrics_base(train_results, test_results)

plot_model_metrics_base(train_results, test_results,
                        models_to_plot = c('FPLS', 'SmoothPLS'))

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.