First of all, thank you for using healthyR.ai
. If you encounter issues or want to make a feature request, please visit https://github.com/spsanderson/healthyR.ai/issues
library(healthyR.ai)
#> == Welcome to healthyR.ai ======================================================
#> If you find this package useful, please leave a star: https://github.com/spsanderson/healthyR.ai
#> If you encounter a bug or want to request an enhancement please file an issue at:
#> https://github.com/spsanderson/healthyR.ai/issues
#> Thank you for using healthyR.ai!
In this should example we will showcase the pca_your_recipe()
function. This function takes only a few arguments. The arguments are currently .data
which is the full data set that gets passed internally to the recipes::bake()
function, .recipe_object
which is a recipe you have already made and want to pass to the function in order to perform the pca, and finally .threshold
which is the fraction of the variance that should be captured by the components.
To start this walk through we will first load in a few libraries.
suppressPackageStartupMessages(library(timetk))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(purrr))
suppressPackageStartupMessages(library(healthyR.data))
suppressPackageStartupMessages(library(rsample))
suppressPackageStartupMessages(library(recipes))
suppressPackageStartupMessages(library(ggplot2))
Now that we have out libraries we can go ahead and get our data set ready.
<- healthyR_data %>%
data_tbl select(visit_end_date_time) %>%
summarise_by_time(
.date_var = visit_end_date_time,
.by = "month",
value = n()
%>%
) set_names("date_col","value") %>%
filter_by_time(
.date_var = date_col,
.start_date = "2013",
.end_date = "2020"
)
head(data_tbl)
#> # A tibble: 6 x 2
#> date_col value
#> <dttm> <int>
#> 1 2013-01-01 00:00:00 2082
#> 2 2013-02-01 00:00:00 1719
#> 3 2013-03-01 00:00:00 1796
#> 4 2013-04-01 00:00:00 1865
#> 5 2013-05-01 00:00:00 2028
#> 6 2013-06-01 00:00:00 1813
The data set is simple and by itself would not be at all useful for a pca analysis since there is only one predictor, being time. In order to facilitate the use of the function and this example, we will create a splits
object and a recipe
object.
<- initial_split(data = data_tbl, prop = 0.8)
splits
splits#> <Analysis/Assess/Total>
#> <76/19/95>
head(training(splits))
#> # A tibble: 6 x 2
#> date_col value
#> <dttm> <int>
#> 1 2018-03-01 00:00:00 1618
#> 2 2020-08-01 00:00:00 1140
#> 3 2017-12-01 00:00:00 1530
#> 4 2015-10-01 00:00:00 1641
#> 5 2017-11-01 00:00:00 1530
#> 6 2015-12-01 00:00:00 1571
<- recipe(value ~ ., training(splits)) %>%
rec_obj step_timeseries_signature(date_col) %>%
step_rm(matches("(iso$)|(xts$)|(hour)|(min)|(sec)|(am.pm)"))
rec_obj#> Data Recipe
#>
#> Inputs:
#>
#> role #variables
#> outcome 1
#> predictor 1
#>
#> Operations:
#>
#> Timeseries signature features from date_col
#> Delete terms matches("(iso$)|(xts$)|(hour)|(min)|(sec)|(am.pm)")
get_juiced_data(rec_obj) %>% glimpse()
#> Rows: 76
#> Columns: 20
#> $ date_col <dttm> 2018-03-01, 2020-08-01, 2017-12-01, 2015-10-01, 20~
#> $ value <int> 1618, 1140, 1530, 1641, 1530, 1571, 1343, 1609, 153~
#> $ date_col_index.num <dbl> 1519862400, 1596240000, 1512086400, 1443657600, 150~
#> $ date_col_year <int> 2018, 2020, 2017, 2015, 2017, 2015, 2018, 2018, 201~
#> $ date_col_half <int> 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 2, ~
#> $ date_col_quarter <int> 1, 3, 4, 4, 4, 4, 3, 3, 3, 2, 3, 2, 4, 1, 3, 2, 3, ~
#> $ date_col_month <int> 3, 8, 12, 10, 11, 12, 9, 8, 7, 4, 9, 6, 10, 2, 7, 6~
#> $ date_col_month.lbl <ord> March, August, December, October, November, Decembe~
#> $ date_col_day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
#> $ date_col_wday <int> 5, 7, 6, 5, 4, 3, 7, 4, 7, 7, 1, 2, 4, 5, 4, 2, 3, ~
#> $ date_col_wday.lbl <ord> Thursday, Saturday, Friday, Thursday, Wednesday, Tu~
#> $ date_col_mday <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
#> $ date_col_qday <int> 60, 32, 62, 1, 32, 62, 63, 32, 1, 1, 63, 62, 1, 32,~
#> $ date_col_yday <int> 60, 214, 335, 274, 305, 335, 244, 213, 182, 91, 244~
#> $ date_col_mweek <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 6, 6, ~
#> $ date_col_week <int> 9, 31, 48, 40, 44, 48, 35, 31, 26, 13, 35, 22, 40, ~
#> $ date_col_week2 <int> 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, ~
#> $ date_col_week3 <int> 0, 1, 0, 1, 2, 0, 2, 1, 2, 1, 2, 1, 1, 2, 0, 1, 1, ~
#> $ date_col_week4 <int> 1, 3, 0, 0, 0, 0, 3, 3, 2, 1, 3, 2, 0, 1, 3, 2, 3, ~
#> $ date_col_mday7 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
Now that we have out initial recipe we can use the pca_your_recipe()
function.
<- pca_your_recipe(
pca_list .recipe_object = rec_obj,
.data = data_tbl,
.threshold = 0.8
)#> Registered S3 method overwritten by 'quantmod':
#> method from
#> as.zoo.data.frame zoo
The function returns a list object and does so insvisible
so you must assign the output to a variable, you can then access the items of the list in the usual manner.
The following items are included in the output of the function:
Lets start going down the list of items.
This is the portion you will want to output to a variable as this is the recipe object itself that you will use further down the line of your work.
<- pca_list$pca_transform
pca_rec_obj
pca_rec_obj#> Data Recipe
#>
#> Inputs:
#>
#> role #variables
#> outcome 1
#> predictor 1
#>
#> Operations:
#>
#> Timeseries signature features from date_col
#> Delete terms matches("(iso$)|(xts$)|(hour)|(min)|(sec)|(am.pm)")
#> Centering for recipes::all_numeric()
#> Scaling for recipes::all_numeric()
#> Sparse, unbalanced variable filter on recipes::all_numeric()
#> No PCA components were extracted.
$variable_loadings
pca_list#> # A tibble: 169 x 4
#> terms value component id
#> <chr> <dbl> <chr> <chr>
#> 1 date_col_index.num -0.0322 PC1 pca_deuzL
#> 2 date_col_year 0.0197 PC1 pca_deuzL
#> 3 date_col_half -0.388 PC1 pca_deuzL
#> 4 date_col_quarter -0.435 PC1 pca_deuzL
#> 5 date_col_month -0.437 PC1 pca_deuzL
#> 6 date_col_wday -0.0207 PC1 pca_deuzL
#> 7 date_col_qday -0.0387 PC1 pca_deuzL
#> 8 date_col_yday -0.437 PC1 pca_deuzL
#> 9 date_col_mweek 0.0372 PC1 pca_deuzL
#> 10 date_col_week -0.438 PC1 pca_deuzL
#> # ... with 159 more rows
$variable_variance
pca_list#> # A tibble: 52 x 4
#> terms value component id
#> <chr> <dbl> <int> <chr>
#> 1 variance 5.13 1 pca_deuzL
#> 2 variance 2.02 2 pca_deuzL
#> 3 variance 1.51 3 pca_deuzL
#> 4 variance 1.44 4 pca_deuzL
#> 5 variance 1.14 5 pca_deuzL
#> 6 variance 0.645 6 pca_deuzL
#> 7 variance 0.591 7 pca_deuzL
#> 8 variance 0.469 8 pca_deuzL
#> 9 variance 0.0602 9 pca_deuzL
#> 10 variance 0.000261 10 pca_deuzL
#> # ... with 42 more rows
$pca_estimates
pca_list#> Data Recipe
#>
#> Inputs:
#>
#> role #variables
#> outcome 1
#> predictor 1
#>
#> Training data contained 76 data points and no missing data.
#>
#> Operations:
#>
#> Timeseries signature features from date_col [trained]
#> Variables removed date_col_year.iso, date_col_month.xts, ... [trained]
#> Centering for value, date_col_index.num, ... [trained]
#> Scaling for value, date_col_index.num, ... [trained]
#> Sparse, unbalanced variable filter removed date_col_day, ... [trained]
#> PCA extraction with date_col_index.num, date_col_year, ... [trained]
$pca_juiced_estimates %>% glimpse()
pca_list#> Rows: 76
#> Columns: 9
#> $ date_col <dttm> 2018-03-01, 2020-08-01, 2017-12-01, 2015-10-01, 20~
#> $ value <dbl> 0.305149153, -1.356838616, -0.000823491, 0.38511927~
#> $ date_col_month.lbl <ord> March, August, December, October, November, Decembe~
#> $ date_col_wday.lbl <ord> Thursday, Saturday, Friday, Thursday, Wednesday, Tu~
#> $ PC1 <dbl> 2.88815748, -0.56941169, -3.24751175, -2.47352658, ~
#> $ PC2 <dbl> -1.015794586, -2.319313350, -0.559802115, 1.0726772~
#> $ PC3 <dbl> -1.523467249, 0.028258251, -2.016014883, -0.7272953~
#> $ PC4 <dbl> 0.39372895, 1.72351394, 0.01768918, -1.16862848, -0~
#> $ PC5 <dbl> 0.7367275, -0.9448078, 0.9034308, -0.9429968, -0.61~
$pca_baked_data %>% glimpse()
pca_list#> Rows: 95
#> Columns: 9
#> $ date_col <dttm> 2013-01-01, 2013-02-01, 2013-03-01, 2013-04-01, 20~
#> $ value <dbl> 1.9184595, 0.6563223, 0.9240484, 1.1639587, 1.73070~
#> $ date_col_month.lbl <ord> January, February, March, April, May, June, July, A~
#> $ date_col_wday.lbl <ord> Tuesday, Friday, Friday, Monday, Wednesday, Saturda~
#> $ PC1 <dbl> 3.7625319, 3.0954942, 2.9045027, 2.2249651, 1.38488~
#> $ PC2 <dbl> 2.590194, 2.152537, 1.910745, 2.547991, 2.218156, 1~
#> $ PC3 <dbl> 1.19844108, -0.49337486, -2.01916451, 1.43188816, -~
#> $ PC4 <dbl> -0.97076717, 0.67257483, 0.99609414, -0.99801117, -~
#> $ PC5 <dbl> -0.15219492, -1.54840667, 0.47164908, 0.28057197, 0~
$pca_rotation_df %>% glimpse()
pca_list#> Rows: 13
#> Columns: 13
#> $ PC1 <dbl> -0.03217120, 0.01968948, -0.38822955, -0.43492801, -0.43719403, -~
#> $ PC2 <dbl> -0.694421233, -0.694509433, -0.002658359, 0.018275720, -0.0106049~
#> $ PC3 <dbl> 0.067435828, 0.068483919, 0.194360959, 0.048889978, -0.009062772,~
#> $ PC4 <dbl> -0.10236226, -0.11087859, 0.16883438, -0.05345862, 0.06977494, 0.~
#> $ PC5 <dbl> 0.003246377, -0.006108173, -0.166537687, -0.049512137, 0.08069837~
#> $ PC6 <dbl> -0.004708018, -0.002759762, -0.285242923, -0.140142302, -0.019249~
#> $ PC7 <dbl> 0.007740802, 0.007384155, -0.058896369, 0.033971238, 0.004542796,~
#> $ PC8 <dbl> 0.03710162, 0.04314412, -0.06656222, 0.01265603, -0.04678856, -0.~
#> $ PC9 <dbl> 0.009932605, -0.015647939, -0.816054745, 0.268366674, 0.213903239~
#> $ PC10 <dbl> 0.0124013618, -0.0110293023, 0.0058390765, 0.2984999440, 0.368052~
#> $ PC11 <dbl> -0.0229852903, 0.0239838979, -0.0010033441, 0.0547968775, 0.61100~
#> $ PC12 <dbl> 9.880721e-03, -9.482604e-03, 3.379946e-03, 7.852100e-01, -4.90061~
#> $ PC13 <dbl> 7.066619e-01, -7.051896e-01, -8.574273e-05, -3.420034e-02, -1.845~
$pca_variance_df %>% glimpse()
pca_list#> Rows: 13
#> Columns: 6
#> $ PC <chr> "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8"~
#> $ var_explained <dbl> 3.948735e-01, 1.550608e-01, 1.163611e-01, 1.104024e-01~
#> $ var_pct_txt <chr> "39.49%", "15.51%", "11.64%", "11.04%", "8.75%", "4.96~
#> $ cum_var_pct <dbl> 0.3948735, 0.5499343, 0.6662954, 0.7766977, 0.8642145,~
#> $ cum_var_pct_txt <chr> "39.49%", "54.99%", "66.63%", "77.67%", "86.42%", "91.~
#> $ ou_threshold <fct> Under, Under, Under, Under, Over, Over, Over, Over, Ov~
$pca_variance_scree_plt pca_list