The SuperML R package is designed to unify the model training process in R, much like in Python. People often spend a lot of time searching for packages and figuring out the syntax for training machine learning models in R; this is especially apparent in users who frequently switch between R and Python. This package provides a scikit-learn-style interface (fit, predict) to train models faster.
In addition to building machine learning models, it provides handy functionality for feature engineering.
This ambitious package is my ongoing effort to help the R community build ML models easily and quickly.
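To give a flavour of the interface before we begin, every trainer in superml follows the same three steps, sketched below with placeholder names (train_data, test_data and "target" are hypothetical; RFTrainer is demonstrated for real later in this tutorial):
library(superml)
# 1. initialise a trainer with its hyperparameters
rf <- RFTrainer$new(n_estimators = 100)
# 2. fit on a data.frame / data.table, naming the target column
rf$fit(X = train_data, y = "target")
# 3. predict on new data
pred <- rf$predict(df = test_data)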
You can install the latest CRAN version using (recommended):
install.packages("superml")
You can install the development version directly from GitHub using:
devtools::install_github("saraswatmks/superml")
This package uses existing R packages to build machine learning models. In this tutorial, we'll use the data.table package for all data manipulation tasks.
We'll quickly prepare the data set so it is ready to be served for model training.
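If you haven't used data.table before, the idioms used throughout this tutorial are sketched below (a toy illustration; dt, x, y and z are made-up names):
library(data.table)
dt <- data.table(x = 1:3, y = c("a", "b", "a"))
dt[, z := x * 2]          # := adds or modifies a column by reference
dt[, c("x", "z") := NULL] # assigning NULL drops columns
dt[y == "a"]              # filter rows without $ or quotes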
load("../data/reg_train.rda")
# if the above doesn't work, you can try: load("reg_train.rda")
library(data.table)
library(caret)
#> Loading required package: lattice
#> Loading required package: ggplot2
library(superml)
#> Loading required package: R6
library(Metrics)
#>
#> Attaching package: 'Metrics'
#> The following objects are masked from 'package:caret':
#>
#> precision, recall
head(reg_train)
#> Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
#> 1: 1 60 RL 65 8450 Pave <NA> Reg
#> 2: 2 20 RL 80 9600 Pave <NA> Reg
#> 3: 3 60 RL 68 11250 Pave <NA> IR1
#> 4: 4 70 RL 60 9550 Pave <NA> IR1
#> 5: 5 60 RL 84 14260 Pave <NA> IR1
#> 6: 6 50 RL 85 14115 Pave <NA> IR1
#> LandContour Utilities LotConfig LandSlope Neighborhood Condition1
#> 1: Lvl AllPub Inside Gtl CollgCr Norm
#> 2: Lvl AllPub FR2 Gtl Veenker Feedr
#> 3: Lvl AllPub Inside Gtl CollgCr Norm
#> 4: Lvl AllPub Corner Gtl Crawfor Norm
#> 5: Lvl AllPub FR2 Gtl NoRidge Norm
#> 6: Lvl AllPub Inside Gtl Mitchel Norm
#> Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
#> 1: Norm 1Fam 2Story 7 5 2003
#> 2: Norm 1Fam 1Story 6 8 1976
#> 3: Norm 1Fam 2Story 7 5 2001
#> 4: Norm 1Fam 2Story 7 5 1915
#> 5: Norm 1Fam 2Story 8 5 2000
#> 6: Norm 1Fam 1.5Fin 5 5 1993
#> YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
#> 1: 2003 Gable CompShg VinylSd VinylSd BrkFace
#> 2: 1976 Gable CompShg MetalSd MetalSd None
#> 3: 2002 Gable CompShg VinylSd VinylSd BrkFace
#> 4: 1970 Gable CompShg Wd Sdng Wd Shng None
#> 5: 2000 Gable CompShg VinylSd VinylSd BrkFace
#> 6: 1995 Gable CompShg VinylSd VinylSd None
#> MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond
#> 1: 196 Gd TA PConc Gd TA
#> 2: 0 TA TA CBlock Gd TA
#> 3: 162 Gd TA PConc Gd TA
#> 4: 0 TA TA BrkTil TA Gd
#> 5: 350 Gd TA PConc Gd TA
#> 6: 0 TA TA Wood Gd TA
#> BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF
#> 1: No GLQ 706 Unf 0 150
#> 2: Gd ALQ 978 Unf 0 284
#> 3: Mn GLQ 486 Unf 0 434
#> 4: No ALQ 216 Unf 0 540
#> 5: Av GLQ 655 Unf 0 490
#> 6: No GLQ 732 Unf 0 64
#> TotalBsmtSF Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF
#> 1: 856 GasA Ex Y SBrkr 856 854
#> 2: 1262 GasA Ex Y SBrkr 1262 0
#> 3: 920 GasA Ex Y SBrkr 920 866
#> 4: 756 GasA Gd Y SBrkr 961 756
#> 5: 1145 GasA Ex Y SBrkr 1145 1053
#> 6: 796 GasA Ex Y SBrkr 796 566
#> LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
#> 1: 0 1710 1 0 2 1
#> 2: 0 1262 0 1 2 0
#> 3: 0 1786 1 0 2 1
#> 4: 0 1717 1 0 1 0
#> 5: 0 2198 1 0 2 1
#> 6: 0 1362 1 0 1 1
#> BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
#> 1: 3 1 Gd 8 Typ
#> 2: 3 1 TA 6 Typ
#> 3: 3 1 Gd 6 Typ
#> 4: 3 1 Gd 7 Typ
#> 5: 4 1 Gd 9 Typ
#> 6: 1 1 TA 5 Typ
#> Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
#> 1: 0 <NA> Attchd 2003 RFn 2
#> 2: 1 TA Attchd 1976 RFn 2
#> 3: 1 TA Attchd 2001 RFn 2
#> 4: 1 Gd Detchd 1998 Unf 3
#> 5: 1 TA Attchd 2000 RFn 3
#> 6: 0 <NA> Attchd 1993 Unf 2
#> GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
#> 1: 548 TA TA Y 0 61
#> 2: 460 TA TA Y 298 0
#> 3: 608 TA TA Y 0 42
#> 4: 642 TA TA Y 0 35
#> 5: 836 TA TA Y 192 84
#> 6: 480 TA TA Y 40 30
#> EnclosedPorch 3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
#> 1: 0 0 0 0 <NA> <NA> <NA>
#> 2: 0 0 0 0 <NA> <NA> <NA>
#> 3: 0 0 0 0 <NA> <NA> <NA>
#> 4: 272 0 0 0 <NA> <NA> <NA>
#> 5: 0 0 0 0 <NA> <NA> <NA>
#> 6: 0 320 0 0 <NA> MnPrv Shed
#> MiscVal MoSold YrSold SaleType SaleCondition SalePrice
#> 1: 0 2 2008 WD Normal 208500
#> 2: 0 5 2007 WD Normal 181500
#> 3: 0 9 2008 WD Normal 223500
#> 4: 0 2 2006 WD Abnorml 140000
#> 5: 0 12 2008 WD Normal 250000
#> 6: 700 10 2009 WD Normal 143000
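# note: createDataPartition samples rows randomly, so call set.seed()
# beforehand (e.g. set.seed(42)) if you need a reproducible split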
split <- createDataPartition(y = reg_train$SalePrice, p = 0.7)
xtrain <- reg_train[split$Resample1]
xtest <- reg_train[!split$Resample1]
# remove features with more than 90% missing values
# we will also remove the Id column because it doesn't contain
# any useful information
na_cols <- colSums(is.na(xtrain)) / nrow(xtrain)
na_cols <- names(na_cols[which(na_cols > 0.9)])
xtrain[, c(na_cols, "Id") := NULL]
xtest[, c(na_cols, "Id") := NULL]
# encode categorical variables
cat_cols <- names(xtrain)[sapply(xtrain, is.character)]
for(c in cat_cols){
lbl <- LabelEncoder$new()
lbl$fit(c(xtrain[[c]], xtest[[c]]))
xtrain[[c]] <- lbl$transform(xtrain[[c]])
xtest[[c]] <- lbl$transform(xtest[[c]])
}
#> The data contains NA values. Imputing NA with 'NA'
#> (the message above is printed once for every categorical column encoded)
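As a standalone illustration, LabelEncoder maps each distinct string to an integer code (a toy example with made-up values; the exact codes assigned are an implementation detail of superml):
lbl <- LabelEncoder$new()
lbl$fit(c("low", "medium", "high", "low"))
lbl$transform(c("low", "high")) # returns the integer codes for these levels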
# remove noisy columns
noise <- c('GrLivArea','TotalBsmtSF')
xtrain[, c(noise) := NULL]
xtest[, c(noise) := NULL]
# fill missing values with -1
xtrain[is.na(xtrain)] <- -1
xtest[is.na(xtest)] <- -1
KNN Regression
knn <- KNNTrainer$new(k = 2, prob = T, type = 'reg')
knn$fit(train = xtrain, test = xtest, y = 'SalePrice')
probs <- knn$predict(type = 'prob')
labels <- knn$predict(type='raw')
rmse(actual = xtest$SalePrice, predicted=labels)
#> [1] 7955.711
SVM Regression
# least squares SVM for regression (type = "ls")
svm <- SVMTrainer$new(type="ls")
svm$fit(xtrain, 'SalePrice')
#> Removing invalid columns. The names should not start with a number: 1stFlrSF,2ndFlrSF,3SsnPorch
pred <- svm$predict(xtest)
rmse(actual = xtest$SalePrice, predicted = pred)
#> [1] 39150.87
Simple Regression
lf <- LMTrainer$new(family="gaussian")
lf$fit(X = xtrain, y = "SalePrice")
summary(lf$model)
#>
#> Call:
#> stats::glm(formula = f, family = self$family, data = X, weights = self$weights)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -351050 -13393 -520 12385 231188
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -3.571e+05 1.459e+06 -0.245 0.806689
#> MSSubClass -1.225e+02 5.848e+01 -2.094 0.036525 *
#> MSZoning -6.997e+02 1.388e+03 -0.504 0.614194
#> LotFrontage -3.533e+01 3.029e+01 -1.166 0.243709
#> LotArea 4.195e-01 1.668e-01 2.515 0.012056 *
#> Street -3.691e+04 1.501e+04 -2.459 0.014112 *
#> LotShape 8.612e+02 1.910e+03 0.451 0.652093
#> LandContour 2.841e+03 2.087e+03 1.361 0.173735
#> Utilities -4.209e+04 3.170e+04 -1.328 0.184562
#> LotConfig 8.873e+02 9.951e+02 0.892 0.372844
#> LandSlope 4.966e+03 4.621e+03 1.075 0.282831
#> Neighborhood -5.274e+02 1.859e+02 -2.836 0.004663 **
#> Condition1 -1.622e+03 7.264e+02 -2.233 0.025806 *
#> Condition2 6.318e+02 3.276e+03 0.193 0.847129
#> BldgType -1.858e+02 2.465e+03 -0.075 0.939921
#> HouseStyle 5.996e+02 1.019e+03 0.588 0.556394
#> OverallQual 1.343e+04 1.297e+03 10.358 < 2e-16 ***
#> OverallCond 6.020e+03 1.144e+03 5.264 1.75e-07 ***
#> YearBuilt 4.250e+02 7.589e+01 5.600 2.80e-08 ***
#> YearRemodAdd 7.071e+01 7.514e+01 0.941 0.346926
#> RoofStyle 1.013e+04 1.900e+03 5.334 1.20e-07 ***
#> RoofMatl -2.592e+04 3.029e+03 -8.556 < 2e-16 ***
#> Exterior1st -1.345e+03 5.966e+02 -2.255 0.024342 *
#> Exterior2nd 1.954e+03 5.661e+02 3.452 0.000581 ***
#> MasVnrType 2.468e+03 1.487e+03 1.660 0.097338 .
#> MasVnrArea 1.114e+01 6.720e+00 1.657 0.097805 .
#> ExterQual -5.931e+02 2.301e+03 -0.258 0.796675
#> ExterCond -7.290e+02 2.224e+03 -0.328 0.743166
#> Foundation -4.696e+03 1.860e+03 -2.525 0.011720 *
#> BsmtQual 4.516e+03 1.360e+03 3.321 0.000931 ***
#> BsmtCond -1.300e+03 1.736e+03 -0.749 0.454250
#> BsmtExposure 1.575e+03 8.701e+02 1.810 0.070630 .
#> BsmtFinType1 -7.787e+02 7.434e+02 -1.048 0.295112
#> BsmtFinSF1 8.919e+00 5.293e+00 1.685 0.092332 .
#> BsmtFinType2 -1.191e+03 1.009e+03 -1.180 0.238130
#> BsmtFinSF2 1.285e+01 1.015e+01 1.266 0.205791
#> BsmtUnfSF 4.192e+00 5.054e+00 0.829 0.407067
#> Heating -1.614e+03 3.663e+03 -0.441 0.659529
#> HeatingQC -1.787e+03 1.298e+03 -1.377 0.168929
#> CentralAir 3.371e+03 4.932e+03 0.684 0.494407
#> Electrical 2.062e+03 2.084e+03 0.989 0.322800
#> `1stFlrSF` 5.752e+01 6.736e+00 8.539 < 2e-16 ***
#> `2ndFlrSF` 4.945e+01 5.640e+00 8.767 < 2e-16 ***
#> LowQualFinSF -3.087e+00 1.976e+01 -0.156 0.875908
#> BsmtFullBath 1.325e+04 2.731e+03 4.852 1.43e-06 ***
#> BsmtHalfBath -1.603e+03 4.276e+03 -0.375 0.707892
#> FullBath 6.742e+03 2.975e+03 2.266 0.023648 *
#> HalfBath 2.320e+03 2.809e+03 0.826 0.409101
#> BedroomAbvGr -5.218e+03 1.788e+03 -2.918 0.003605 **
#> KitchenAbvGr -2.552e+04 5.774e+03 -4.420 1.10e-05 ***
#> KitchenQual 8.469e+03 1.746e+03 4.850 1.44e-06 ***
#> TotRmsAbvGrd 2.146e+03 1.321e+03 1.624 0.104603
#> Functional -3.791e+03 1.268e+03 -2.989 0.002872 **
#> Fireplaces -1.359e+03 2.459e+03 -0.553 0.580623
#> FireplaceQu 4.760e+03 1.297e+03 3.670 0.000256 ***
#> GarageType 7.107e+02 1.226e+03 0.580 0.562353
#> GarageYrBlt -7.794e+00 5.285e+00 -1.475 0.140601
#> GarageFinish 2.052e+03 1.380e+03 1.487 0.137271
#> GarageCars 1.682e+04 3.248e+03 5.178 2.73e-07 ***
#> GarageArea -7.396e+00 1.081e+01 -0.684 0.493923
#> GarageQual 5.489e+03 3.239e+03 1.694 0.090500 .
#> GarageCond -3.802e+03 3.339e+03 -1.139 0.255111
#> PavedDrive -1.461e+03 3.024e+03 -0.483 0.629067
#> WoodDeckSF 2.576e+01 8.695e+00 2.963 0.003122 **
#> OpenPorchSF 3.569e+01 1.689e+01 2.113 0.034826 *
#> EnclosedPorch 2.951e+01 1.828e+01 1.615 0.106748
#> `3SsnPorch` 6.283e+00 3.208e+01 0.196 0.844766
#> ScreenPorch 3.986e+01 1.824e+01 2.186 0.029062 *
#> PoolArea 1.795e+01 2.450e+01 0.732 0.464072
#> Fence -1.996e+03 1.139e+03 -1.752 0.080135 .
#> MiscVal -3.626e+00 4.957e+00 -0.732 0.464592
#> MoSold 5.242e+02 3.587e+02 1.461 0.144317
#> YrSold -3.306e+02 7.279e+02 -0.454 0.649811
#> SaleType 2.217e+03 1.140e+03 1.944 0.052215 .
#> SaleCondition 3.923e+02 1.278e+03 0.307 0.759016
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for gaussian family taken to be 857045073)
#>
#> Null deviance: 5.8961e+12 on 1023 degrees of freedom
#> Residual deviance: 8.1334e+11 on 949 degrees of freedom
#> AIC: 24043
#>
#> Number of Fisher Scoring iterations: 2
predictions <- lf$predict(df = xtest)
rmse(actual = xtest$SalePrice, predicted = predictions)
#> [1] 42543.57
Lasso Regression
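# alpha follows the elastic-net convention: alpha = 1 gives the lasso penalty, alpha = 0 gives ridge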
lf <- LMTrainer$new(family = "gaussian", alpha=1, lambda = 1000)
lf$fit(X = xtrain, y = "SalePrice")
predictions <- lf$predict(df = xtest)
rmse(actual = xtest$SalePrice, predicted = predictions)
#> [1] 44811.36
Ridge Regression
lf <- LMTrainer$new(family = "gaussian", alpha=0)
lf$fit(X = xtrain, y = "SalePrice")
predictions <- lf$predict(df = xtest)
rmse(actual = xtest$SalePrice, predicted = predictions)
#> [1] 45068.47
Linear Regression with CV
lf <- LMTrainer$new(family = "gaussian")
lf$cv_model(X = xtrain, y = 'SalePrice', nfolds = 5, parallel = FALSE)
#> Computation done.
predictions <- lf$cv_predict(df = xtest)
coefs <- lf$get_importance()
rmse(actual = xtest$SalePrice, predicted = predictions)
#> [1] 52081.12
Random Forest
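# classification = 0 fits a regression forest; the classification
# examples later in this tutorial use classification = 1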
rf <- RFTrainer$new(n_estimators = 500, classification = 0)
rf$fit(X = xtrain, y = "SalePrice")
pred <- rf$predict(df = xtest)
rf$get_importance()
#> tmp.order.tmp..decreasing...TRUE..
#> OverallQual 777029590539
#> GarageCars 486924551594
#> GarageArea 437459061046
#> 1stFlrSF 398940884465
#> YearBuilt 353217593164
#> GarageYrBlt 287282305565
#> FullBath 242838168546
#> BsmtFinSF1 197903119378
#> ExterQual 188930354367
#> LotArea 169167289090
#> TotRmsAbvGrd 166392646455
#> YearRemodAdd 162960595639
#> 2ndFlrSF 155896350979
#> FireplaceQu 147383561214
#> Fireplaces 145202320673
#> KitchenQual 115559879535
#> MasVnrArea 101114402440
#> Foundation 99595523763
#> LotFrontage 84714106948
#> BsmtQual 79228168082
#> BsmtFinType1 77848264383
#> OpenPorchSF 76173477125
#> BsmtUnfSF 68860075715
#> WoodDeckSF 65289339515
#> Neighborhood 59822232929
#> Exterior2nd 55097837654
#> HeatingQC 47003957553
#> BedroomAbvGr 42464358029
#> GarageType 41329502292
#> MSSubClass 37310483254
#> MoSold 31551115648
#> OverallCond 31098731619
#> Exterior1st 30844200956
#> RoofStyle 29970236311
#> HouseStyle 27073576461
#> HalfBath 26493076533
#> BsmtFullBath 26482006341
#> PoolArea 24130791765
#> GarageFinish 23069021802
#> Fence 22608569549
#> SaleCondition 18240876259
#> MSZoning 17807035283
#> BsmtExposure 17705393977
#> YrSold 17582196103
#> LotShape 16550764236
#> MasVnrType 13828215184
#> SaleType 13786715152
#> LotConfig 13237342187
#> BldgType 11577780865
#> LandContour 9669210656
#> GarageCond 9307415911
#> EnclosedPorch 8611668023
#> CentralAir 8434865017
#> GarageQual 8018729456
#> KitchenAbvGr 7992065587
#> RoofMatl 7827604887
#> ScreenPorch 7430499029
#> BsmtCond 7069996004
#> ExterCond 6973108572
#> Condition1 6772402174
#> BsmtFinSF2 5980274444
#> PavedDrive 5451808841
#> LandSlope 5375662363
#> Functional 4847109814
#> BsmtFinType2 4668972871
#> Electrical 3598993357
#> BsmtHalfBath 2507052661
#> MiscVal 2140035430
#> 3SsnPorch 1013149382
#> Condition2 939211208
#> Street 869266473
#> Heating 757126794
#> LowQualFinSF 742371337
#> Utilities 21079527
rmse(actual = xtest$SalePrice, predicted = pred)
#> [1] 38137.25
XGBoost
xgb <- XGBTrainer$new(objective = "reg:linear"
, n_estimators = 500
, eval_metric = "rmse"
, maximize = F
, learning_rate = 0.1
, max_depth = 6)
xgb$fit(X = xtrain, y = "SalePrice", valid = xtest)
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:176829.937500 val-rmse:184763.828125
#> [51] train-rmse:8563.614258 val-rmse:37722.960938
#> [101] train-rmse:5134.214844 val-rmse:36804.046875
#> [151] train-rmse:3201.090332 val-rmse:36658.128906
#> [201] train-rmse:2054.285645 val-rmse:36623.574219
#> [251] train-rmse:1384.464966 val-rmse:36616.812500
#> [301] train-rmse:963.443542 val-rmse:36598.050781
#> [351] train-rmse:640.461548 val-rmse:36606.914062
#> [401] train-rmse:444.808289 val-rmse:36604.488281
#> [451] train-rmse:304.764740 val-rmse:36612.445312
#> [500] train-rmse:214.617828 val-rmse:36608.250000
pred <- xgb$predict(xtest)
rmse(actual = xtest$SalePrice, predicted = pred)
#> [1] 36608.25
Grid Search
xgb <- XGBTrainer$new(objective="reg:linear")
gst <- GridSearchCV$new(trainer = xgb,
parameters = list(n_estimators = c(10,50), max_depth = c(5,2)),
n_folds = 3,
scoring = c('accuracy','auc'))
gst$fit(xtrain, "SalePrice")
#> [1] "entering grid search"
#> [1] "In total, 4 models will be trained"
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:139353.812500
#> [10] train-rmse:14108.108398
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:139829.421875
#> [10] train-rmse:16228.581055
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:141289.140625
#> [10] train-rmse:15962.292969
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:139353.812500
#> [50] train-rmse:3111.119385
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:139829.421875
#> [50] train-rmse:4170.987305
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:141289.140625
#> [50] train-rmse:3926.346924
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:139886.109375
#> [10] train-rmse:27032.160156
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:140684.515625
#> [10] train-rmse:29519.238281
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:142074.062500
#> [10] train-rmse:30981.111328
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:139886.109375
#> [50] train-rmse:16118.603516
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:140684.515625
#> [50] train-rmse:17320.564453
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-rmse:142074.062500
#> [50] train-rmse:16859.873047
gst$best_iteration()
#> $n_estimators
#> [1] 10
#>
#> $max_depth
#> [1] 5
#>
#> $accuracy_avg
#> [1] 0
#>
#> $accuracy_sd
#> [1] 0
#>
#> $auc_avg
#> [1] NaN
#>
#> $auc_sd
#> [1] NA
Note: accuracy and auc are classification metrics, which is why they come out as zero or NaN in these searches on a regression target.
Random Search
rf <- RFTrainer$new()
rst <- RandomSearchCV$new(trainer = rf,
parameters = list(n_estimators = c(10,50),
max_depth = c(5,2)),
n_folds = 3,
scoring = c('accuracy','auc'),
n_iter=3)
rst$fit(xtrain, "SalePrice")
#> [1] "In total, 3 models will be trained"
rst$best_iteration()
#> $n_estimators
#> [1] 50
#>
#> $max_depth
#> [1] 2
#>
#> $accuracy_avg
#> [1] 0.01756106
#>
#> $accuracy_sd
#> [1] 0.01272758
#>
#> $auc_avg
#> [1] NaN
#>
#> $auc_sd
#> [1] NA
Here, we will solve a simple binary classification problem: predicting which passengers survived the Titanic disaster. The idea is to demonstrate how to use this package to solve classification problems.
Data Preparation
# load the classification data
load('../data/cla_train.rda')
# if the above doesn't work, you can try: load("cla_train.rda")
head(cla_train)
#> PassengerId Survived Pclass
#> 1: 1 0 3
#> 2: 2 1 1
#> 3: 3 1 3
#> 4: 4 1 1
#> 5: 5 0 3
#> 6: 6 0 3
#> Name Sex Age SibSp
#> 1: Braund, Mr. Owen Harris male 22 1
#> 2: Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1
#> 3: Heikkinen, Miss. Laina female 26 0
#> 4: Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1
#> 5: Allen, Mr. William Henry male 35 0
#> 6: Moran, Mr. James male NA 0
#> Parch Ticket Fare Cabin Embarked
#> 1: 0 A/5 21171 7.2500 S
#> 2: 0 PC 17599 71.2833 C85 C
#> 3: 0 STON/O2. 3101282 7.9250 S
#> 4: 0 113803 53.1000 C123 S
#> 5: 0 373450 8.0500 S
#> 6: 0 330877 8.4583 Q
# split the data
split <- createDataPartition(y = cla_train$Survived, p = 0.7)
xtrain <- cla_train[split$Resample1]
xtest <- cla_train[!split$Resample1]
# encode categorical variables - shorter way
for(c in c('Embarked','Sex','Cabin')){
lbl <- LabelEncoder$new()
lbl$fit(c(xtrain[[c]], xtest[[c]]))
xtrain[[c]] <- lbl$transform(xtrain[[c]])
xtest[[c]] <- lbl$transform(xtest[[c]])
}
#> The data contains blank values. Imputing them with 'NA'
#> (the message above is printed once for every column encoded)
# impute missing values
xtrain[, Age := replace(Age, is.na(Age), median(Age, na.rm = T))]
xtest[, Age := replace(Age, is.na(Age), median(Age, na.rm = T))]
# drop these features
to_drop <- c('PassengerId','Ticket','Name')
xtrain <- xtrain[,-c(to_drop), with=F]
xtest <- xtest[,-c(to_drop), with=F]
Now, our data is ready to be served for model training. Let’s do it.
KNN Classification
knn <- KNNTrainer$new(k = 2, prob = T, type = 'class')
knn$fit(train = xtrain, test = xtest, y = 'Survived')
probs <- knn$predict(type = 'prob')
labels <- knn$predict(type='raw')
auc(actual = xtest$Survived, predicted=labels)
#> [1] 0.6776491
Naive Bayes Classification
nb <- NBTrainer$new()
nb$fit(xtrain, 'Survived')
pred <- nb$predict(xtest)
auc(actual = xtest$Survived, predicted=pred)
#> [1] 0.710828
SVM Classification
#predicts probabilities - must specify mc_type ("OvA_hinge", "AvA_hinge")
svm <- SVMTrainer$new(predict.prob = T, type="bc", mc_type="OvA_hinge")
svm$fit(xtrain, 'Survived')
pred <- svm$predict(xtest)
auc(actual = xtest$Survived, predicted=pred[,2])
#> [1] 0.784916
#predicts labels
svm <- SVMTrainer$new(predict.prob = F, type="bc")
svm$fit(xtrain, 'Survived')
pred <- svm$predict(xtest)
auc(actual = xtest$Survived, predicted=pred)
#> [1] 0.7381008
Logistic Regression
lf <- LMTrainer$new(family="binomial")
lf$fit(X = xtrain, y = "Survived")
summary(lf$model)
#>
#> Call:
#> stats::glm(formula = f, family = self$family, data = X, weights = self$weights)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -2.0647 -0.5139 -0.3550 0.5659 2.5979
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) 1.882835 0.667638 2.820 0.00480 **
#> Pclass -0.991285 0.198153 -5.003 5.66e-07 ***
#> Sex 3.014839 0.250533 12.034 < 2e-16 ***
#> Age -0.050270 0.010402 -4.833 1.35e-06 ***
#> SibSp -0.376242 0.132598 -2.837 0.00455 **
#> Parch -0.137521 0.146524 -0.939 0.34796
#> Fare 0.001671 0.002794 0.598 0.54981
#> Cabin 0.017868 0.005923 3.017 0.00256 **
#> Embarked 0.076637 0.148818 0.515 0.60657
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 823.56 on 623 degrees of freedom
#> Residual deviance: 495.21 on 615 degrees of freedom
#> AIC: 513.21
#>
#> Number of Fisher Scoring iterations: 5
predictions <- lf$predict(df = xtest)
auc(actual = xtest$Survived, predicted = predictions)
#> [1] 0.7930805
Lasso Logistic Regression
lf <- LMTrainer$new(family="binomial", alpha=1)
lf$cv_model(X = xtrain, y = "Survived", nfolds = 5, parallel = FALSE)
#> Computation done.
pred <- lf$cv_predict(df = xtest)
auc(actual = xtest$Survived, predicted = pred)
#> [1] 0.7981181
Ridge Logistic Regression
lf <- LMTrainer$new(family="binomial", alpha=0)
lf$cv_model(X = xtrain, y = "Survived", nfolds = 5, parallel = FALSE)
#> Computation done.
pred <- lf$cv_predict(df = xtest)
auc(actual = xtest$Survived, predicted = pred)
#> [1] 0.7937464
Random Forest
rf <- RFTrainer$new(n_estimators = 500, classification = 1, max_features = 3)
rf$fit(X = xtrain, y = "Survived")
pred <- rf$predict(df = xtest)
rf$get_importance()
#> tmp.order.tmp..decreasing...TRUE..
#> Sex 83.179331
#> Fare 49.530245
#> Age 44.026986
#> Cabin 27.806777
#> Pclass 22.210427
#> SibSp 13.742906
#> Parch 9.837351
#> Embarked 6.777296
auc(actual = xtest$Survived, predicted = pred)
#> [1] 0.7512739
XGBoost
xgb <- XGBTrainer$new(objective = "binary:logistic"
, n_estimators = 500
, eval_metric = "auc"
, maximize = T
, learning_rate = 0.1
, max_depth = 6)
xgb$fit(X = xtrain, y = "Survived", valid = xtest)
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-auc:0.910654 val-auc:0.815229
#> [51] train-auc:0.977882 val-auc:0.803243
#> [101] train-auc:0.990142 val-auc:0.808280
#> [151] train-auc:0.994508 val-auc:0.807354
#> [201] train-auc:0.996520 val-auc:0.809352
#> [251] train-auc:0.997454 val-auc:0.809496
#> [301] train-auc:0.998147 val-auc:0.808309
#> [351] train-auc:0.998554 val-auc:0.808136
#> [401] train-auc:0.998796 val-auc:0.809120
#> [451] train-auc:0.999060 val-auc:0.809699
#> [500] train-auc:0.999104 val-auc:0.809699
pred <- xgb$predict(xtest)
auc(actual = xtest$Survived, predicted = pred)
#> [1] 0.8096989
Grid Search
xgb <- XGBTrainer$new(objective="binary:logistic")
gst <- GridSearchCV$new(trainer = xgb,
parameters = list(n_estimators = c(10,50),
max_depth = c(5,2)),
n_folds = 3,
scoring = c('accuracy','auc'))
gst$fit(xtrain, "Survived")
#> [1] "entering grid search"
#> [1] "In total, 4 models will be trained"
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-error:0.129808
#> [10] train-error:0.098558
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-error:0.105769
#> [10] train-error:0.088942
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-error:0.098558
#> [10] train-error:0.069712
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-error:0.129808
#> [50] train-error:0.038462
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-error:0.105769
#> [50] train-error:0.036058
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-error:0.098558
#> [50] train-error:0.036058
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-error:0.177885
#> [10] train-error:0.153846
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-error:0.201923
#> [10] train-error:0.137019
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-error:0.182692
#> [10] train-error:0.115385
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-error:0.177885
#> [50] train-error:0.110577
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-error:0.201923
#> [50] train-error:0.096154
#> converting the data into xgboost format..
#> starting with training...
#> [1] train-error:0.182692
#> [50] train-error:0.081731
gst$best_iteration()
#> $n_estimators
#> [1] 10
#>
#> $max_depth
#> [1] 5
#>
#> $accuracy_avg
#> [1] 0
#>
#> $accuracy_sd
#> [1] 0
#>
#> $auc_avg
#> [1] 0.883034
#>
#> $auc_sd
#> [1] 0.0242347
Random Search
rf <- RFTrainer$new()
rst <- RandomSearchCV$new(trainer = rf,
parameters = list(n_estimators = c(10,50),
max_depth = c(5,2)),
n_folds = 3,
scoring = c('accuracy','auc'),
n_iter = 3)
rst$fit(xtrain, "Survived")
#> [1] "In total, 3 models will be trained"
rst$best_iteration()
#> $n_estimators
#> [1] 50
#>
#> $max_depth
#> [1] 5
#>
#> $accuracy_avg
#> [1] 0.849359
#>
#> $accuracy_sd
#> [1] 0.0264787
#>
#> $auc_avg
#> [1] 0.8279856
#>
#> $auc_sd
#> [1] 0.02242134
Let's create a new feature from the target variable using target encoding, then test a model (a hand-rolled sketch of the encoding idea follows the code below).
# add target encoding features
xtrain[, feat_01 := smoothMean(train_df = xtrain,
test_df = xtest,
colname = "Embarked",
target = "Survived")$train[[2]]]
xtest[, feat_01 := smoothMean(train_df = xtrain,
test_df = xtest,
colname = "Embarked",
target = "Survived")$test[[2]]]
# train a random forest on the new feature set
rf <- RFTrainer$new(n_estimators = 500, classification = 1, max_features = 4)
rf$fit(X = xtrain, y = "Survived")
pred <- rf$predict(df = xtest)
rf$get_importance()
#> tmp.order.tmp..decreasing...TRUE..
#> Sex 85.213569
#> Fare 51.676287
#> Age 47.071256
#> Cabin 28.804936
#> Pclass 22.431287
#> SibSp 13.735815
#> Parch 9.643044
#> feat_01 4.449812
#> Embarked 4.385365
auc(actual = xtest$Survived, predicted = pred)
#> [1] 0.7512739