The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.
Exploratory and diagnostic machine learning tools for R
The goal of this package is multifold:
install.packages("mltools")
install.packages("devtools")
devtools::install_github("ben519/mltools")
Predict whether or not someone is an alien.
library(data.table)
library(mltools)
# Copy the toy datasets since they are locked from being modified
train <- copy(alien.train)
test <- copy(alien.test)
train
SkinColor IQScore Cat1 Cat2 Cat3 IsAlien
1: green 300 type1 type1 type4 TRUE
2: white 95 type1 type2 type4 FALSE
3: brown 105 type2 type6 type11 FALSE
4: white 250 type4 type5 type2 TRUE
5: blue 115 type2 type7 type11 TRUE
6: white 85 type4 type5 type2 FALSE
7: green 130 type1 type2 type4 TRUE
8: white 115 type1 type1 type4 FALSE
test
SkinColor IQScore Cat1 Cat2 Cat3
1: white 79 type4 type5 type2
2: green 100 type4 type5 type2
3: brown 125 type3 type9 type7
4: white 90 type1 type8 type4
5: red 115 type1 type2 type4
# Combine train (excluding IsAlien) and test
alien.all <- rbind(train[, !"IsAlien", with=FALSE], test)
#--------------------------------------------------
## Check for correlated and hierarchical fields
gini_impurities(alien.all, wide=TRUE) # weighted conditional gini impurities
Var1 Cat1 Cat2 Cat3 SkinColor
1: Cat1 0.0000000 0.3589744 0.0000000 0.4743590
2: Cat2 0.0000000 0.0000000 0.0000000 0.3461538
3: Cat3 0.0000000 0.3589744 0.0000000 0.4743590
4: SkinColor 0.4102564 0.5384615 0.4102564 0.0000000
# (Cat1, Cat3) = (Cat3, Cat1) = 0 => Cat1 and Cat3 perfectly correspond to each other
# (Cat1, Cat2) > 0 and (Cat2, Cat1) = 0 => Cat1-Cat2 exhibit a parent-child relationship.
# You can guess Cat1 by knowing Cat2, but not vice-versa.
#--------------------------------------------------
## Check relationship between IQScore and IsAlien by binning IQScore into groups
train[, BinIQScore := bin_data(IQScore, bins=seq(0, 300, by=50))]
IQScore BinIQScore
1: 300 [250, 300]
2: 95 [50, 100)
3: 105 [100, 150)
4: 250 [250, 300]
5: 115 [100, 150)
6: 85 [50, 100)
7: 130 [100, 150)
8: 115 [100, 150)
train[, list(Samples=.N, IQScore=mean(IQScore)), keyby=BinIQScore]
BinIQScore Samples IQScore
1: [50, 100) 2 90.00
2: [100, 150) 4 116.25
3: [250, 300] 2 275.00
# Remove column BinIQScore
train[, BinIQScore := NULL]
#--------------------------------------------------
## Check skewness of fields
skewness(alien.all)
$SkinColor
SkinColor Count Pcnt
1: white 6 0.46153846
2: green 3 0.23076923
3: brown 2 0.15384615
4: blue 1 0.07692308
5: red 1 0.07692308
$Cat1
Cat1 Count Pcnt
1: type1 6 0.46153846
2: type4 4 0.30769231
3: type2 2 0.15384615
4: type3 1 0.07692308
...
set.seed(711)
#--------------------------------------------------
## Set SkinColor as a factor, such that it has the same levels in train and test
## Set low frequency skin colors (1 or fewer occurrences) as "_other_"
skincolors <- list(train$SkinColor, test$SkinColor)
skincolors <- set_factor(skincolors, aggregationThreshold=1)
train[, SkinColor := skincolors[[1]] ] # update train with the new values
test[, SkinColor := skincolors[[2]] ] # update test with the new values
# Repeat the process above for other categorical fields (without setting low freq. values as "_other_")
for(col in c("Cat1", "Cat2", "Cat3")){
  vals <- list(train[[col]], test[[col]])
  vals <- set_factor(vals)
  set(train, j=col, value=vals[[1]])
  set(test, j=col, value=vals[[2]])
}
#--------------------------------------------------
## Randomly split the training data into 2 equally sized datasets
# Partition train into two folds, stratified by IsAlien
train[, FoldID := folds(IsAlien, nfolds=2, stratified=TRUE, seed=2016)]
cvtrain <- train[FoldID==1, !"FoldID"]
SkinColor IQScore Cat1 Cat2 Cat3 IsAlien
1: green 300 type1 type1 type4 TRUE
2: brown 105 type2 type6 type11 FALSE
3: green 130 type1 type2 type4 TRUE
4: white 115 type1 type1 type4 FALSE
cvtest <- train[FoldID==2, !"FoldID"]
SkinColor IQScore Cat1 Cat2 Cat3 IsAlien
1: white 95 type1 type2 type4 FALSE
2: white 250 type4 type5 type2 TRUE
3: _other_ 115 type2 type7 type11 TRUE
4: white 85 type4 type5 type2 FALSE
#--------------------------------------------------
## Convert cvtrain and cvtest to sparse matrices
## Note that unordered factors are one-hot-encoded
library(Matrix)
cvtrain.sparse <- sparsify(cvtrain)
4 x 21 sparse Matrix of class "dgCMatrix"
SkinColor__other_ SkinColor_brown SkinColor_green SkinColor_white IQScore Cat1_type1 ...
[1,] . . 1 . 300 1
[2,] . 1 . . 105 .
[3,] . . 1 . 130 1
[4,] . . . 1 115 1
cvtest.sparse <- sparsify(cvtest)
4 x 21 sparse Matrix of class "dgCMatrix"
SkinColor__other_ SkinColor_brown SkinColor_green SkinColor_white IQScore Cat1_type1 ...
[1,] . . . 1 95 1
[2,] . . . 1 250 .
[3,] 1 . . . 115 .
[4,] . . . 1 85 .
#--------------------------------------------------
## Naive model that guesses someone is an alien if their IQScore is > 130
cvtest[, Prediction := ifelse(IQScore > 130, TRUE, FALSE)]
#--------------------------------------------------
## Evaluate predictions
# Area Under the ROC Curve (AUC ROC)
auc_roc(preds=cvtest$Prediction, actuals=cvtest$IsAlien)
0.75
# Individual scores to determine which predictions were good/bad (see help(roc_scores) for details)
cvtest[, ROCScore := roc_scores(preds=Prediction, actuals=IsAlien)]
cvtest[order(ROCScore)]
SkinColor IQScore Cat1 Cat2 Cat3 IsAlien Prediction ROCScore
1: white 95 type1 type2 type4 FALSE FALSE 0.0000000
2: white 250 type4 type5 type2 TRUE TRUE 0.0000000
3: white 85 type4 type5 type2 FALSE FALSE 0.0000000
4: _other_ 115 type2 type7 type11 TRUE FALSE 0.1666667
If you’d like to contact me regarding bugs, questions, or general consulting, feel free to drop me a line - bgorman519@gmail.com
These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.