This vignette demonstrates the supervised learning capabilities of tidylearn. All methods shown here wrap established R packages: the algorithms are unchanged; tidylearn simply provides a consistent interface and tidy output.
Wrapped packages include stats (lm() and glm()) for linear and logistic regression, among others. Access raw model objects via model$fit for package-specific functionality.
Let’s create a binary classification problem from the iris dataset:
# Load required packages
library(tidylearn)
library(dplyr)

# Create binary classification dataset
iris_binary <- iris %>%
  filter(Species %in% c("setosa", "versicolor")) %>%
  mutate(Species = droplevels(Species))
# Split data
split <- tl_split(iris_binary, prop = 0.7, stratify = "Species", seed = 123)
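Because the split is stratified on Species, both partitions keep the 50/50 class balance of the filtered data. A quick base-R check (not part of the original workflow) confirms this:

# Confirm class proportions survived the stratified split
table(split$train$Species)
table(split$test$Species)

# Train logistic regression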
model_logistic <- tl_model(split$train, Species ~ ., method = "logistic")
#> Warning: glm.fit: algorithm did not converge
#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
These warnings are expected here: setosa and versicolor are perfectly separable in iris, so the logistic regression coefficients diverge. The fitted model still classifies correctly; regularization (discussed later) avoids this issue in practice.
print(model_logistic)
#> tidylearn Model
#> ===============
#> Paradigm: supervised
#> Method: logistic
#> Task: Classification
#> Formula: Species ~ .
#>
#> Training observations: 70
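The underlying glm object is available for anything tidylearn does not expose directly, via the model$fit accessor described above. For example:

# Inspect the wrapped glm object (illustrative; uses only the model$fit accessor)
summary(model_logistic$fit)

# Train decision tree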
model_tree <- tl_model(split$train, Species ~ ., method = "tree")
print(model_tree)
#> tidylearn Model
#> ===============
#> Paradigm: supervised
#> Method: tree
#> Task: Classification
#> Formula: Species ~ .
#>
#> Training observations: 70
# Predictions
preds_tree <- predict(model_tree, new_data = split$test)
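Predictions come back in tidy form with a .pred column (the column name used later in this vignette), so they drop straight into dplyr pipelines:

# Peek at the tidy prediction output
head(preds_tree)

The same interface extends to multiclass classification, using all three Species levels from the full iris dataset:

# Split full iris dataset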
split_multi <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 123)
# Train random forest
model_forest <- tl_model(split_multi$train, Species ~ ., method = "forest")
print(model_forest)
#> tidylearn Model
#> ===============
#> Paradigm: supervised
#> Method: forest
#> Task: Classification
#> Formula: Species ~ .
#>
#> Training observations: 105

The same interface handles regression. Switching to the mtcars dataset:

# Split mtcars data
split_reg <- tl_split(mtcars, prop = 0.7, seed = 123)
# Train linear model
model_lm <- tl_model(split_reg$train, mpg ~ wt + hp + disp, method = "linear")
print(model_lm)
#> tidylearn Model
#> ===============
#> Paradigm: supervised
#> Method: linear
#> Task: Regression
#> Formula: mpg ~ wt + hp + disp
#>
#> Training observations: 22

# Polynomial regression for non-linear relationships
model_poly <- tl_model(split_reg$train, mpg ~ wt, method = "polynomial", degree = 2)
print(model_poly)
#> tidylearn Model
#> ===============
#> Paradigm: supervised
#> Method: polynomial
#> Task: Regression
#> Formula: mpg ~ wt
#>
#> Training observations: 22

Regularization helps prevent overfitting by adding penalties to model complexity.
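Regularized fits are not demonstrated with tl_model() here, so the sketch below illustrates the idea with glmnet directly (an assumed, standard choice; this vignette does not name the package tidylearn would use for this):

# Minimal lasso sketch with glmnet (assumed dependency, shown for illustration only)
library(glmnet)
x <- as.matrix(split_reg$train[, c("wt", "hp", "disp")])  # predictor matrix
y <- split_reg$train$mpg                                  # numeric response

# alpha = 1 is the lasso (alpha = 0 gives ridge); cv.glmnet selects lambda by CV
cv_fit <- cv.glmnet(x, y, alpha = 1)

# Predict on the test set at the cross-validated lambda
x_test <- as.matrix(split_reg$test[, c("wt", "hp", "disp")])
preds_lasso <- predict(cv_fit, newx = x_test, s = "lambda.min")

With only 22 training rows, the cross-validated lambda is noisy; treat this purely as an API illustration.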
# Compare multiple models
models <- list(
  linear = tl_model(split_reg$train, mpg ~ ., method = "linear"),
  tree = tl_model(split_reg$train, mpg ~ ., method = "tree"),
  forest = tl_model(split_reg$train, mpg ~ ., method = "forest")
)

# Calculate RMSE for each model
results <- data.frame(
  Model = character(),
  RMSE = numeric(),
  stringsAsFactors = FALSE
)

for (model_name in names(models)) {
  preds <- predict(models[[model_name]], new_data = split_reg$test)
  rmse <- sqrt(mean((preds$.pred - split_reg$test$mpg)^2))
  results <- rbind(results, data.frame(
    Model = model_name,
    RMSE = rmse
  ))
}
results <- results %>% arrange(RMSE)
print(results)
#>    Model     RMSE
#> 1 forest 2.046967
#> 2 linear 2.281450
#> 3   tree 4.095888

The random forest achieves the lowest test RMSE, though with a dataset as small as mtcars these rankings are sensitive to the particular split. The formula interface supports standard R formula syntax:

# Interaction terms
model_interact <- tl_model(split_reg$train, mpg ~ wt * hp, method = "linear")
# Polynomial terms using I()
model_poly_manual <- tl_model(split_reg$train, mpg ~ wt + I(wt^2), method = "linear")
# Subset of predictors
model_subset <- tl_model(split_reg$train, mpg ~ wt + hp + disp, method = "linear")

Categorical predictors work without manual dummy coding; tree-based methods in particular can split on factor levels directly.

# Create dataset with categorical variables
mtcars_cat <- mtcars %>%
  mutate(
    cyl = as.factor(cyl),
    gear = as.factor(gear),
    am = as.factor(am)
  )
split_cat <- tl_split(mtcars_cat, prop = 0.7, seed = 123)
# Model with categorical predictors
model_cat <- tl_model(split_cat$train, mpg ~ ., method = "forest")
print(model_cat)
#> tidylearn Model
#> ===============
#> Paradigm: supervised
#> Method: forest
#> Task: Regression
#> Formula: mpg ~ .
#>
#> Training observations: 22

Real data often contains missing values; tl_prepare_data() can impute them before modeling.

# Create data with missing values
mtcars_missing <- mtcars
set.seed(123)  # make the injected missingness reproducible
mtcars_missing[sample(1:nrow(mtcars_missing), 5), "hp"] <- NA
mtcars_missing[sample(1:nrow(mtcars_missing), 3), "wt"] <- NA
# Preprocess to handle missing values
processed_missing <- tl_prepare_data(
  mtcars_missing,
  mpg ~ .,
  impute_method = "mean",
  scale_method = "standardize"
)
#> Imputing missing values using method: mean
#> Scaling numeric features using method: standardize
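One caution: preprocessing statistics (imputation means, scaling parameters) should be learned on training data and then applied unchanged to new data. This vignette does not show how tl_prepare_data() supports that, so the principle is sketched manually in base R:

# Manual sketch: reuse training statistics on new data (illustrative; new_rows is hypothetical)
train_mean_hp <- mean(mtcars_missing$hp, na.rm = TRUE)   # mean learned from training data
new_rows <- data.frame(hp = c(110, NA, 175))             # hypothetical incoming data
new_rows$hp[is.na(new_rows$hp)] <- train_mean_hp         # impute with the TRAINING mean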
# Train model
model_imputed <- tl_model(processed_missing$data, mpg ~ ., method = "linear")

In summary, tidylearn provides a unified interface for supervised learning:
a single entry point, tl_model(), for all methods, with tidy prediction output throughout.

# Complete workflow example
final_split <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 42)
final_prep <- tl_prepare_data(final_split$train, Species ~ ., scale_method = "standardize")
#> Scaling numeric features using method: standardize
final_model <- tl_model(final_prep$data, Species ~ ., method = "forest")
final_preds <- predict(final_model, new_data = final_split$test)
# Evaluate
accuracy <- mean(final_preds$.pred == final_split$test$Species)
cat("Test Accuracy:", round(accuracy * 100, 1), "%\n")
#> Test Accuracy: 33.3 %

Note the chance-level accuracy (one in three classes): the model was trained on standardized features, but the raw, unscaled test set was passed to predict(). Always apply the training-set preprocessing to new data before predicting, as sketched in the missing-values section above.