
modelStudio - R & Python examples

Hubert Baniecki

2023-02-20

R & Python Examples

R

The modelStudio() function uses DALEX explainers created with DALEX::explain() or DALEXtra::explain_*().

# packages for the explainer objects
install.packages("DALEX")
install.packages("DALEXtra")

mlr dashboard

In this example, we make a studio for the ranger model on the apartments data.

# load packages and data
library(mlr)
library(DALEXtra)
library(modelStudio)

data <- DALEX::apartments

# split the data
index <- sample(1:nrow(data), 0.7*nrow(data))
train <- data[index,]
test <- data[-index,]

# fit a model
task <- makeRegrTask(id = "apartments", data = train, target = "m2.price")
learner <- makeLearner("regr.ranger", predict.type = "response")
model <- train(learner, task)

# create an explainer for the model
explainer <- explain_mlr(model,
                         data = test,
                         y = test$m2.price,
                         label = "mlr")

# pick observations
new_observation <- test[1:2,]
rownames(new_observation) <- c("id1", "id2")

# make a studio for the model
modelStudio(explainer, new_observation)

mlr3 dashboard

In this example, we make a studio for the ranger model on the titanic data.

# load packages and data
library(mlr3)
library(mlr3learners)
library(DALEXtra)
library(modelStudio)

data <- DALEX::titanic_imputed

# split the data
index <- sample(1:nrow(data), 0.7*nrow(data))
train <- data[index,]
test <- data[-index,]

# mlr3 TaskClassif takes target as factor
train$survived <- as.factor(train$survived)

# fit a model
task <- TaskClassif$new(id = "titanic", backend = train, target = "survived")
learner <- lrn("classif.ranger", predict_type = "prob")
learner$train(task)

# create an explainer for the model
explainer <- explain_mlr3(learner,
                          data = test,
                          y = test$survived,
                          label = "mlr3")

# pick observations
new_observation <- test[1:2,]
rownames(new_observation) <- c("id1", "id2")

# make a studio for the model
modelStudio(explainer, new_observation)

xgboost dashboard

In this example, we make a studio for the xgboost model on the titanic data.

# load packages and data
library(xgboost)
library(DALEX)
library(modelStudio)

data <- DALEX::titanic_imputed

# split the data
index <- sample(1:nrow(data), 0.7*nrow(data))
train <- data[index,]
test <- data[-index,]

train_matrix <- model.matrix(survived ~ . - 1, train)
test_matrix <- model.matrix(survived ~ . - 1, test)

# fit a model
xgb_matrix <- xgb.DMatrix(train_matrix, label = train$survived)
params <- list(max_depth = 3, objective = "binary:logistic", eval_metric = "auc")
model <- xgb.train(params, xgb_matrix, nrounds = 500)

# create an explainer for the model
explainer <- explain(model,
                     data = test_matrix,
                     y = test$survived,
                     type = "classification",
                     label = "xgboost")

# pick observations
new_observation <- test_matrix[1:2, , drop=FALSE]
rownames(new_observation) <- c("id1", "id2")

# make a studio for the model
modelStudio(explainer, new_observation)

caret dashboard

In this example, we make a studio for the gbm model on the titanic data.

# load packages and data
library(caret)
library(DALEX)
library(modelStudio)

data <- DALEX::titanic_imputed

# split the data
index <- sample(1:nrow(data), 0.7*nrow(data))
train <- data[index,]
test <- data[-index,]

# caret train takes target as factor
train$survived <- as.factor(train$survived)

# fit a model
cv <- trainControl(method = "repeatedcv", number = 3, repeats = 3)
model <- train(survived ~ ., data = train, method = "gbm", trControl = cv, verbose = FALSE)

# create an explainer for the model
explainer <- explain(model,
                     data = test,
                     y = test$survived,
                     label = "caret")

# pick observations
new_observation <- test[1:2,]
rownames(new_observation) <- c("id1", "id2")

# make a studio for the model
modelStudio(explainer, new_observation)

h2o dashboard

In this example, we make a studio for the h2o.automl model on the titanic data.

# load packages and data
library(h2o)
library(DALEXtra)
library(modelStudio)

data <- DALEX::titanic_imputed

# init h2o
h2o.init()
h2o.no_progress()

# split the data
h2o_split <- h2o.splitFrame(as.h2o(data))
train <- h2o_split[[1]]
test <- as.data.frame(h2o_split[[2]])

# h2o automl takes target as factor
train$survived <- as.factor(train$survived)

# fit a model
automl <- h2o.automl(y = "survived", training_frame = train, max_runtime_secs = 30)
model <- automl@leader

# create an explainer for the model
explainer <- explain_h2o(model,
                         data = test,
                         y = test$survived,
                         label = "h2o")

# pick observations
new_observation <- test[1:2,]
rownames(new_observation) <- c("id1", "id2")

# make a studio for the model
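# (B = 5 lowers the number of permutation rounds used to compute SHAP
# and feature importance, trading precision for speed)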
modelStudio(explainer, new_observation,
            B = 5)

# shutdown h2o
h2o.shutdown(prompt = FALSE)

parsnip dashboard

In this example, we make a studio for the ranger model on the apartments data.

# load packages and data
library(parsnip)
library(DALEX)
library(modelStudio)

data <- DALEX::apartments

# split the data
index <- sample(1:nrow(data), 0.7*nrow(data))
train <- data[index,]
test <- data[-index,]

# fit a model
model <- rand_forest() %>%
         set_engine("ranger", importance = "impurity") %>%
         set_mode("regression") %>%
         fit(m2.price ~ ., data = train)

# create an explainer for the model
explainer <- explain(model,
                     data = test,
                     y = test$m2.price,
                     label = "parsnip")

# make a studio for the model
modelStudio(explainer)

tidymodels dashboard

In this example, we make a studio for the ranger model on the titanic data.

# load packages and data
library(tidymodels)
library(DALEXtra)
library(modelStudio)

data <- DALEX::titanic_imputed

# split the data
index <- sample(1:nrow(data), 0.7*nrow(data))
train <- data[index,]
test <- data[-index,]

# tidymodels fit takes target as factor
train$survived <- as.factor(train$survived)

# fit a model
rec <- recipe(survived ~ ., data = train) %>%
       step_normalize(fare)

clf <- rand_forest(mtry = 2) %>%
       set_engine("ranger") %>%
       set_mode("classification")

wflow <- workflow() %>%
         add_recipe(rec) %>%
         add_model(clf)

model <- wflow %>% fit(data = train)

# create an explainer for the model
explainer <- explain_tidymodels(model,
                                data = test,
                                y = test$survived,
                                label = "tidymodels")

# pick observations
new_observation <- test[1:2,]
rownames(new_observation) <- c("id1", "id2")

# make a studio for the model
modelStudio(explainer, new_observation)

Python

The modelStudio() function uses dalex explainers created with dalex.Explainer().

# package for the Explainer object
pip install dalex -U
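
The Python workflow mirrors the R one: wrap a fitted model in an Explainer and serialize it for R. A minimal sketch (model, X, and y are placeholders):

# wrap any fitted model in an Explainer, then save it for R
import dalex as dx

explainer = dx.Explainer(model, data=X, y=y, label='any model')
explainer.dump(open('explainer.pickle', 'wb'))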

Use the pickle Python module and the reticulate R package to easily make a studio for the model.

# package for pickle load
install.packages("reticulate")

scikit-learn dashboard

In this example, we make a studio for the Pipeline SVR model on the fifa data.

First, use dalex in Python:

# load packages and data
import dalex as dx
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from numpy import log

data = dx.datasets.load_fifa()
X = data.drop(columns=['overall', 'potential', 'value_eur', 'wage_eur', 'nationality'])
y = log(data.value_eur)

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y)

# fit a pipeline model
model = Pipeline([('scale', StandardScaler()), ('svm', SVR())])
model.fit(X_train, y_train)

# create an explainer for the model
explainer = dx.Explainer(model, data=X_test, y=y_test, label='scikit-learn')

# pack the explainer into a pickle file
explainer.dump(open('explainer_scikitlearn.pickle', 'wb'))

Then, use modelStudio in R:

# load the explainer from the pickle file
library(reticulate)
explainer <- py_load_object("explainer_scikitlearn.pickle", pickle = "pickle")

# make a studio for the model
library(modelStudio)
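# B = 5 lowers the number of permutation rounds used to compute SHAP
# and feature importance, trading precision for speed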
modelStudio(explainer, B = 5)

lightgbm dashboard

In this example, we make a studio for the Pipeline LGBMClassifier model on the titanic data.

First, use dalex in Python:

# load packages and data
import dalex as dx
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from lightgbm import LGBMClassifier

data = dx.datasets.load_titanic()
X = data.drop(columns='survived')
y = data.survived

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y)

# fit a pipeline model
numerical_features = ['age', 'fare', 'sibsp', 'parch']
numerical_transformer = Pipeline(
  steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
  ]
)
categorical_features = ['gender', 'class', 'embarked']
categorical_transformer = Pipeline(
  steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
  ]
)

preprocessor = ColumnTransformer(
  transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
  ]
)

classifier = LGBMClassifier(n_estimators=300)

model = Pipeline(
  steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier)
  ]
)
model.fit(X_train, y_train)

# create an explainer for the model
explainer = dx.Explainer(model, data=X_test, y=y_test, label='lightgbm')

# pack the explainer into a pickle file
explainer.dump(open('explainer_lightgbm.pickle', 'wb')) 

Then, use modelStudio in R:

# load the explainer from the pickle file
library(reticulate)
explainer <- py_load_object("explainer_lightgbm.pickle", pickle = "pickle")

# make a studio for the model
library(modelStudio)
modelStudio(explainer)

keras/tensorflow dashboard

In this example, we make a studio for the Pipeline KerasClassifier model on the titanic data.

First, use dalex in Python:

# load packages and data
import dalex as dx
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers import Dense
from keras.models import Sequential

data = dx.datasets.load_titanic()
X = data.drop(columns='survived')
y = data.survived

# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y)

# fit a pipeline model
numerical_features = ['age', 'fare', 'sibsp', 'parch']
numerical_transformer = Pipeline(
  steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
  ]
)
categorical_features = ['gender', 'class', 'embarked']
categorical_transformer = Pipeline(
  steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
  ]
)

preprocessor = ColumnTransformer(
  transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
  ]
)

def create_architecture():
    model = Sequential()
    # there are 17 inputs after the pipeline
    model.add(Dense(60, input_dim=17, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

classifier = KerasClassifier(build_fn=create_architecture,
                             epochs=100, batch_size=32, verbose=False)

model = Pipeline(
  steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier)
  ]
)
model.fit(X_train, y_train)

# create an explainer for the model
explainer = dx.Explainer(model, data=X_test, y=y_test, label='keras')

# pack the explainer into a pickle file
explainer.dump(open('explainer_keras.pickle', 'wb'))

Then, use modelStudio in R:

# load the explainer from the pickle file
library(reticulate)

#! the pickle stores create_architecture by reference only, so a stub
#! function with that name must exist in Python before the load !
py_run_string('
def create_architecture():
    return True
')

explainer <- py_load_object("explainer_keras.pickle", pickle = "pickle")

# make a studio for the model
library(modelStudio)
modelStudio(explainer)
