The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

predsplot_examples

Rousseeuw, P.J.

2025-07-14

TopGear data

We start by loading the data, and selecting the relevant features. As pre-processing, we set values of accel==0 to missing because it is impossible. We also remove the incorrect weight of the Peugeot 107.

data(TopGear, package = "robustHD")
cars = TopGear; rm(TopGear)
dim(cars) # [1] 297  32

## [1] 297  32

rownames(cars) = paste(cars[,1],cars[,2])
# Now the rownames are make and model of the cars.
colnames(cars)

##  [1] "Maker"              "Model"              "Type"              
##  [4] "Fuel"               "Price"              "Cylinders"         
##  [7] "Displacement"       "DriveWheel"         "BHP"               
## [10] "Torque"             "Acceleration"       "TopSpeed"          
## [13] "MPG"                "Weight"             "Length"            
## [16] "Width"              "Height"             "AdaptiveHeadlights"
## [19] "AdjustableSteering" "AlarmSystem"        "Automatic"         
## [22] "Bluetooth"          "ClimateControl"     "CruiseControl"     
## [25] "ElectricSeats"      "Leather"            "ParkingSensors"    
## [28] "PowerSteering"      "SatNav"             "ESP"               
## [31] "Verdict"            "Origin"

colnames(cars)[4:17] = c("fuel","price","cyl","displ","drive",
                         "hp","torque","accel","topspeed",
                         "MPG","weight","length","width",
                         "height")
summary(cars$accel) # zero minimum is impossible

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   5.900   9.100   8.839  11.400  16.900

rownames(cars)[which(cars$accel == 0)]

## [1] "Citroen C5 Tourer" "Ford Mondeo"       "Lotus Elise"      
## [4] "Renault Twizy"     "Ssangyong Rodius"

# [1] "Citroen C5 Tourer" "Ford Mondeo"   "Lotus Elise"
# [4] "Renault Twizy"     "Ssangyong Rodius"
cars$accel[cars$accel == 0] = NA
summary(cars$accel) # OK

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    2.50    6.20    9.20    8.99   11.50   16.90       5

cars$weight[order(cars$weight)[1:4]]

## [1] 210 450 490 550

rownames(cars)[order(cars$weight)[1:4]]

## [1] "Peugeot 107"      "Renault Twizy"    "Morgan 3 Wheeler" "Caterham Super 7"

cars$weight[cars$weight == 210] = NA
summary(cars$weight) # OK

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     450    1250    1496    1541    1778    2705      34

# save(cars, file = "Topgear.RData")
# load("Topgear.RData")

Numerical example in the introduction

summary(cars$topspeed) # in mph

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    50.0   112.0   126.0   132.7   151.0   252.0       4

summary(cars$displ)    # in cc

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     647    1560    1995    2504    2988    7993       9

summary(cars$length)   # in mm

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    2337    4157    4464    4427    4766    5612      11

cars$length = cars$length/1000 # in meters

fit = lm(hp ~ topspeed + length + displ, data = cars)
summary(fit)

## 
## Call:
## lm(formula = hp ~ topspeed + length + displ, data = cars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -142.034  -20.667   -2.715   15.966  174.020 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.069e+02  2.861e+01  -7.232 4.78e-12 ***
## topspeed     2.466e+00  1.373e-01  17.963  < 2e-16 ***
## length      -1.313e+01  6.187e+00  -2.122   0.0347 *  
## displ        6.255e-02  2.808e-03  22.275  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 37.72 on 274 degrees of freedom
##   (19 observations deleted due to missingness)
## Multiple R-squared:  0.9405, Adjusted R-squared:  0.9399 
## F-statistic:  1445 on 3 and 274 DF,  p-value: < 2.2e-16

predsplot(fit, main = "Top Gear data")

## 
## Prediction terms in the model:
##         prediction term   stdev up/down
##                topspeed  68.380      up
##                  length   5.817    down
##                   displ  91.790      up
##  Total prediction of hp 149.200

fact = 0.51
width = fact*10; height = fact*8
maxnchar = 6
main = "Top Gear data"
pdf(file = "topgear_numerical_predsplot.pdf",
    width = width, height = height)
predsplot(fit, main=main, maxchar.level = maxnchar)

## 
## Prediction terms in the model:
##         prediction term   stdev up/down
##                topspeed  68.380      up
##                  length   5.817    down
##                   displ  91.790      up
##  Total prediction of hp 149.200

dev.off()

## png 
##   2

predscor(fit, sort.by.stdev = FALSE)

## Correlation matrix of the prediction terms:
##          topspeed length displ
## topspeed     1.00  -0.45  0.80
## length      -0.45   1.00 -0.56
## displ        0.80  -0.56  1.00

# fact = 0.6
# width = fact*10; height = fact*8
# pdf(file = "topgear_numerical_predscor.pdf", 
#     width = height, height = height)
# predscor(fit, sort.by.stdev = FALSE)
# dev.off()

Figure 2 and TopGear figures in Supplementary Material:

cars$GPM = 1/cars$MPG

fit4 = lm(GPM ~ accel + drive + weight + fuel, data = cars)
summary(fit4)

## 
## Call:
## lm(formula = GPM ~ accel + drive + weight + fuel, data = cars)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.026997 -0.004301  0.000268  0.003660  0.055643 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.681e-02  4.884e-03   3.443 0.000677 ***
## accel       -1.356e-03  2.459e-04  -5.516 8.81e-08 ***
## driveFront  -2.927e-03  1.619e-03  -1.808 0.071853 .  
## driveRear   -2.745e-04  1.577e-03  -0.174 0.861992    
## weight       1.131e-05  1.841e-06   6.143 3.25e-09 ***
## fuelPetrol   8.427e-03  1.246e-03   6.761 9.98e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007973 on 245 degrees of freedom
##   (46 observations deleted due to missingness)
## Multiple R-squared:  0.6058, Adjusted R-squared:  0.5977 
## F-statistic: 75.29 on 5 and 245 DF,  p-value: < 2.2e-16

## Figure 2:
## 
fact = 0.52
width = fact*10; height = fact*8
maxnchar = 6
main = "Top Gear data"
# pdf(file = "topgear_4_predsplot.pdf",
#     width = width, height = height)
predsplot(fit4, main=main, maxchar.level = maxnchar)

## 
## Prediction terms in the model:
##          prediction term    stdev up/down
##                    accel 0.004329    down
##                    drive 0.001400        
##                   weight 0.004490      up
##                     fuel 0.004104        
##  Total prediction of GPM 0.009783

# dev.off()

Three figures in the Supplementary Material:

## 
# fact = 0.6
# width = fact*10; height = fact*8
# pdf(file = "topgear_4_predscor.pdf", 
#     width = height, height = height)
predscor(fit4, sort.by.stdev = FALSE)

## Correlation matrix of the prediction terms:
##        accel drive weight  fuel
## accel   1.00  0.65   0.49  0.31
## drive   0.65  1.00   0.62  0.08
## weight  0.49  0.62   1.00 -0.23
## fuel    0.31  0.08  -0.23  1.00

# dev.off()

car = "Bentley Continental"
# pdf(file = "topgear_4_predsplot_1_dens.pdf",
#     width = width, height = height)
predsplot(fit4, main = main, casetoshow=car, 
          displaytype = "density", 
          maxchar.level = maxnchar, 
          xlab = paste0("prediction for ",car))

## 
##  Prediction terms for case Bentley Continental:
##          prediction term    value
##                    accel +0.00621
##                    drive +0.00152
##                   weight +0.00857
##                     fuel +0.00322
##                      SUM +0.01952
##               centercept  0.02599
##  Total prediction of GPM  0.04551

# dev.off()

car = "Kia Rio"
# pdf(file = "topgear_4_predsplot_2_stairs.pdf",
#     width = width, height = height)
predsplot(fit4, main = main, casetoshow=car, 
          staircase = TRUE, maxchar.level = maxnchar, 
          xlab = paste0("prediction for ",car))

## 
##  Prediction terms for case Kia Rio:
##          prediction term    value
##                    accel -0.00884
##                    drive -0.00141
##                   weight -0.00420
##                     fuel -0.00520
##                      SUM -0.01966
##               centercept  0.02599
##  Total prediction of GPM  0.00633

# dev.off()

Test combination of expressions and types

We add a logical variable (AlarmSystem) as well as a character variable (SatNav)

summary(cars$AlarmSystem)

## no/optional    standard 
##         112         185

cars$alarm = (cars$AlarmSystem == "standard")
summary(cars$alarm)

##    Mode   FALSE    TRUE 
## logical     112     185

summary(cars$SatNav)

##       no optional standard 
##       86      116       95

cars$navig = "y"
cars$navig[cars$SatNav == "no"] = "n"
summary(cars$navig)

##    Length     Class      Mode 
##       297 character character

table(cars$navig)

## 
##   n   y 
##  86 211

class(cars$navig)

## [1] "character"

str(cars)

## 'data.frame':    297 obs. of  35 variables:
##  $ Maker             : Factor w/ 51 levels "Alfa Romeo","Aston Martin",..: 1 1 2 2 2 2 2 2 2 3 ...
##  $ Model             : Factor w/ 296 levels "1 Series","1 Series Convertible",..: 139 186 101 102 103 262 266 267 268 29 ...
##  $ Type              : Factor w/ 297 levels "107 1.0 68 Active 5d",..: 139 190 100 101 102 263 269 268 267 30 ...
##  $ fuel              : Factor w/ 2 levels "Diesel","Petrol": 1 2 2 2 2 2 2 2 2 2 ...
##  $ price             : num  21250 15155 30995 131995 141995 ...
##  $ cyl               : num  4 4 4 12 12 12 12 8 8 4 ...
##  $ displ             : num  1598 1368 1329 5935 5935 ...
##  $ drive             : Factor w/ 3 levels "4WD","Front",..: 2 2 2 3 3 3 3 3 3 2 ...
##  $ hp                : num  105 105 98 517 517 510 573 420 420 86 ...
##  $ torque            : num  236 95 92 457 457 420 457 346 346 118 ...
##  $ accel             : num  11.3 10.7 11.8 4.6 4.6 4.2 4.1 4.7 4.7 11.7 ...
##  $ topspeed          : num  115 116 106 183 183 190 183 180 180 112 ...
##  $ MPG               : num  64 49 56 19 19 17 19 20 20 55 ...
##  $ weight            : num  1385 1090 988 1785 1890 ...
##  $ length            : num  4.35 4.06 3.08 4.72 4.72 ...
##  $ width             : num  1798 1720 1680 NA NA ...
##  $ height            : num  1465 1446 1500 1282 1282 ...
##  $ AdaptiveHeadlights: Factor w/ 3 levels "no","optional",..: 2 2 1 3 3 1 3 1 1 2 ...
##  $ AdjustableSteering: Factor w/ 2 levels "no","standard": 2 2 2 2 2 2 2 2 2 2 ...
##  $ AlarmSystem       : Factor w/ 2 levels "no/optional",..: 2 2 1 1 1 2 2 2 2 2 ...
##  $ Automatic         : Factor w/ 3 levels "no","optional",..: 1 1 2 3 3 1 3 2 2 1 ...
##  $ Bluetooth         : Factor w/ 3 levels "no","optional",..: 3 3 3 3 3 3 3 2 2 3 ...
##  $ ClimateControl    : Factor w/ 3 levels "no","optional",..: 3 2 3 3 3 3 3 3 3 2 ...
##  $ CruiseControl     : Factor w/ 3 levels "no","optional",..: 3 3 3 3 3 3 3 2 2 2 ...
##  $ ElectricSeats     : Factor w/ 3 levels "no","optional",..: 2 1 1 3 3 3 3 3 3 1 ...
##  $ Leather           : Factor w/ 3 levels "no","optional",..: 2 2 1 3 3 3 3 3 3 2 ...
##  $ ParkingSensors    : Factor w/ 3 levels "no","optional",..: 2 3 1 3 3 3 3 3 3 2 ...
##  $ PowerSteering     : Factor w/ 2 levels "no","standard": 2 2 2 2 2 2 2 2 2 2 ...
##  $ SatNav            : Factor w/ 3 levels "no","optional",..: 2 2 3 3 3 3 3 2 2 2 ...
##  $ ESP               : Factor w/ 3 levels "no","optional",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Verdict           : num  6 5 7 7 7 7 7 8 7 6 ...
##  $ Origin            : Factor w/ 3 levels "Asia","Europe",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ GPM               : num  0.0156 0.0204 0.0179 0.0526 0.0526 ...
##  $ alarm             : logi  TRUE TRUE FALSE FALSE FALSE TRUE ...
##  $ navig             : chr  "y" "y" "y" "y" ...

fitcombin = lm(1/MPG ~ accel + log(weight) + accel:torque + alarm + navig, data = cars)
summary(fitcombin)

## 
## Call:
## lm(formula = 1/MPG ~ accel + log(weight) + accel:torque + alarm + 
##     navig, data = cars)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.026131 -0.005201 -0.001251  0.004847  0.056823 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.018e-01  3.347e-02  -3.040  0.00262 ** 
## accel        -2.115e-03  2.461e-04  -8.591 9.87e-16 ***
## log(weight)   2.135e-02  4.709e-03   4.534 9.01e-06 ***
## alarmTRUE    -2.450e-03  1.343e-03  -1.824  0.06932 .  
## navigy       -1.693e-03  1.367e-03  -1.239  0.21667    
## accel:torque -3.403e-06  1.394e-06  -2.442  0.01532 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.008684 on 247 degrees of freedom
##   (44 observations deleted due to missingness)
## Multiple R-squared:  0.5293, Adjusted R-squared:  0.5198 
## F-statistic: 55.56 on 5 and 247 DF,  p-value: < 2.2e-16

fact = 0.45
width = fact*10; height = fact*8
main = "Example with transformation, interaction, logical, character"
# pdf(file = "topgear_logical+char_predsplot.pdf", width = width, height = height)
predsplot(fitcombin, main=main)

## 
## Prediction terms in the model:
##            prediction term     stdev up/down
##                      accel 0.0067300    down
##                log(weight) 0.0054000      up
##                      alarm 0.0011630        
##                      navig 0.0007587        
##               accel:torque 0.0024290        
##  Total prediction of 1/MPG 0.0091180

# dev.off()  

fact = 0.6
width = fact*10; height = fact*8
# pdf(file = "topgear_logical+char_predscor.pdf",
#     width = height, height = height)
predscor(fitcombin)

## Correlation matrix of the prediction terms:
##              accel log(weight) accel:torque alarm navig
## accel         1.00        0.53        -0.09 -0.37 -0.33
## log(weight)   0.53        1.00        -0.75 -0.43 -0.35
## accel:torque -0.09       -0.75         1.00  0.26  0.18
## alarm        -0.37       -0.43         0.26  1.00  0.38
## navig        -0.33       -0.35         0.18  0.38  1.00

# dev.off()

Titanic data

We start by loading the data, turning the pclass and sex variables into factors.

data(data_titanic, package = "classmap")
titanic = data_titanic; rm(data_titanic)
dim(titanic) # 1309  13

## [1] 1309   13

# A data frame with 1309 observations on the following variables.
#   passengerId: a unique identifier for each passenger.
#   pclass: travel class of the passenger.
#   name: name of the passenger.
#   sex: sex of the passenger.
#   age: age of the passenger.
#   sibsp: number of siblings and spouses traveling with the passenger.
#   parch: number of parents and children traveling with the passenger.
#   ticket: ticket number of the passenger.
#   fare: fare paid for the ticket.
#   Cabin: cabin number of the passenger.
#   embarked: Port of embarkation. Takes the values
#             C (Cherbourg), Q (Queenstown) and
#             S (Southampton).
#   y: factor indicating casualty or survivor.
#   dataType: vector taking the values “train” or “test”
#             indicating whether the observation belongs
#             to training or test data.
#
colnames(titanic) = c("passengerId","pclass","name","sex",
                   "age", "sibsp", "parch", "ticket",
                   "fare", "cabin", "embarked", "survival",
                   "dataType")
str(titanic)

## 'data.frame':    1309 obs. of  13 variables:
##  $ passengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ sex        : chr  "male" "female" "female" "female" ...
##  $ age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ sibsp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ cabin      : chr  "" "C85" "" "C123" ...
##  $ embarked   : chr  "S" "C" "S" "S" ...
##  $ survival   : Factor w/ 2 levels "casualty","survived": 1 2 2 2 1 1 1 1 2 2 ...
##  $ dataType   : chr  "train" "train" "train" "train" ...

unique(titanic$pclass) # 3 1 2

## [1] 3 1 2

titanic$pclass = factor(titanic$pclass, levels = unique(titanic$pclass))
#

unique(titanic$sex)

## [1] "male"   "female"

titanic$sex = factor(titanic$sex, levels = unique(titanic$sex) )
titanic$sex = factor(titanic$sex, labels = c("M","F"))
head(titanic)

##   passengerId pclass                                                name sex
## 1           1      3                             Braund, Mr. Owen Harris   M
## 2           2      1 Cumings, Mrs. John Bradley (Florence Briggs Thayer)   F
## 3           3      3                              Heikkinen, Miss. Laina   F
## 4           4      1        Futrelle, Mrs. Jacques Heath (Lily May Peel)   F
## 5           5      3                            Allen, Mr. William Henry   M
## 6           6      3                                    Moran, Mr. James   M
##   age sibsp parch           ticket    fare cabin embarked survival dataType
## 1  22     1     0        A/5 21171  7.2500              S casualty    train
## 2  38     1     0         PC 17599 71.2833   C85        C survived    train
## 3  26     0     0 STON/O2. 3101282  7.9250              S survived    train
## 4  35     1     0           113803 53.1000  C123        S survived    train
## 5  35     0     0           373450  8.0500              S casualty    train
## 6  NA     0     0           330877  8.4583              Q casualty    train

# save(titanic, file="Titanic.RData")
# load("Titanic.RData")

Now fit a logistic regression model.

fit5 <- glm(survival ~ sex + age + sibsp + parch + pclass,
            family=binomial, data = titanic)
summary(fit5)

## 
## Call:
## glm(formula = survival ~ sex + age + sibsp + parch + pclass, 
##     family = binomial, data = titanic)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.002093   0.221287  -4.528 5.94e-06 ***
## sexF         2.556861   0.173270  14.756  < 2e-16 ***
## age         -0.039489   0.006634  -5.952 2.64e-09 ***
## sibsp       -0.352914   0.105349  -3.350 0.000808 ***
## parch        0.074361   0.099907   0.744 0.456697    
## pclass1      2.352022   0.228803  10.280  < 2e-16 ***
## pclass2      0.985265   0.199384   4.942 7.75e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1414.62  on 1045  degrees of freedom
## Residual deviance:  970.12  on 1039  degrees of freedom
##   (263 observations deleted due to missingness)
## AIC: 984.12
## 
## Number of Fisher Scoring iterations: 4

Figure 3:

fact = 0.5
width = fact*10; height = fact*8
main = "Titanic data"
# pdf(file = "titanic_5_predsplot_dens.pdf",
#     width = width, height = height)
predsplot(fit5, main=main, displaytype = "density")

## 
## Prediction terms in the model:
##                      prediction term   stdev up/down
##                                  sex 1.23600        
##                                  age 0.56920    down
##                                sibsp 0.32190    down
##                                parch 0.06244      up
##                               pclass 0.98130        
##  Total linear prediction of survival 1.67000        
## 
## The glm fit used the logit link function.

# The glm fit used the logit link function.
# dev.off()

# With other options for density estimation:
predsplot(fit5, main=main, displaytype = "density",
          bw = "SJ", adjust = 0.5)

## 
## Prediction terms in the model:
##                      prediction term   stdev up/down
##                                  sex 1.23600        
##                                  age 0.56920    down
##                                sibsp 0.32190    down
##                                parch 0.06244      up
##                               pclass 0.98130        
##  Total linear prediction of survival 1.67000        
## 
## The glm fit used the logit link function.

Three figures in Supplementary Material:

# pdf(file = "titanic_5_predsplot_1.pdf",
#     width = width, height = height)
predsplot(fit5, main = main, casetoshow=1)

## 
##  Prediction terms for case 1:
##                                 prediction term    value
##                                             sex -0.94843
##                                             age +0.31122
##                                           sibsp -0.17544
##                                           parch -0.03128
##                                          pclass -0.88444
##                                             SUM -1.72839
##                                      centercept -0.49537
##             Total linear prediction of survival -2.22376
##  Total prediction of survival in response units  0.09764
## 
## The glm fit used the logit link function.

# dev.off()

# pdf(file = "titanic_5_predsplot_2_stairs.pdf",
#     width = width, height = height)
predsplot(fit5, main = main, casetoshow=2, staircase = TRUE)

## 
##  Prediction terms for case 2:
##                                 prediction term    value
##                                             sex +1.60843
##                                             age -0.32060
##                                           sibsp -0.17544
##                                           parch -0.03128
##                                          pclass +1.46758
##                                             SUM +2.54868
##                                      centercept -0.49537
##             Total linear prediction of survival  2.05331
##  Total prediction of survival in response units  0.88628
## 
## The glm fit used the logit link function.

# dev.off()

# fact = 0.6
# width = fact*10; height = fact*8
# pdf(file = "titanic_5_predscor.pdf", 
#     width = height, height = height)
predscor(fit5, sort.by.stdev = FALSE)

## Correlation matrix of the prediction terms:
##          sex   age sibsp parch pclass
## sex     1.00  0.06 -0.10  0.22   0.14
## age     0.06  1.00 -0.24  0.15  -0.41
## sibsp  -0.10 -0.24  1.00 -0.37   0.04
## parch   0.22  0.15 -0.37  1.00  -0.02
## pclass  0.14 -0.41  0.04 -0.02   1.00

# dev.off()

German credit data

data(german.credit, package = "fairml")
credit = german.credit; rm(german.credit)
dim(credit) # 1000 21

## [1] 1000   21

str(credit)

## 'data.frame':    1000 obs. of  21 variables:
##  $ Account_status          : Factor w/ 4 levels "< 0 DM",">= 200 DM",..: 1 3 4 1 1 4 4 3 4 3 ...
##  $ Duration                : num  6 48 12 42 24 36 24 36 12 30 ...
##  $ Credit_history          : Factor w/ 5 levels "all credits at this bank paid back duly",..: 2 4 2 4 3 4 4 4 4 2 ...
##  $ Purpose                 : Factor w/ 10 levels "business","car (new)",..: 8 8 5 6 2 5 6 3 8 2 ...
##  $ Credit_amount           : num  1169 5951 2096 7882 4870 ...
##  $ Savings_bonds           : Factor w/ 5 levels "< 100 DM",">= 1000 DM",..: 5 1 1 1 1 5 4 1 2 1 ...
##  $ Present_employment_since: Factor w/ 5 levels "< 1 year",">= 7 years",..: 2 3 4 4 3 3 2 3 4 5 ...
##  $ Installment_rate        : num  4 2 2 2 3 2 3 2 2 4 ...
##  $ Other_debtors_guarantors: Factor w/ 3 levels "co-applicant",..: 3 3 3 2 3 3 3 3 3 3 ...
##  $ Resident_since          : num  4 2 3 4 4 4 4 2 4 2 ...
##  $ Property                : Factor w/ 4 levels "building society savings agreement / life insurance",..: 3 3 3 1 4 4 1 2 3 2 ...
##  $ Age                     : num  67 22 49 45 53 35 53 35 61 28 ...
##  $ Other_installment_plans : Factor w/ 3 levels "bank","none",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Housing                 : Factor w/ 3 levels "rent","own","for free": 2 2 2 3 3 3 2 1 2 2 ...
##  $ Existing_credits        : num  2 1 1 1 2 1 1 1 1 2 ...
##  $ Job                     : Factor w/ 4 levels "management / self-employed / highly qualified employee / officer",..: 2 2 4 2 2 4 2 1 4 1 ...
##  $ People_maintenance_for  : num  1 1 2 2 2 2 1 1 1 1 ...
##  $ Telephone               : Factor w/ 2 levels "none","yes": 2 1 1 1 1 2 1 2 1 1 ...
##  $ Foreign_worker          : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Credit_risk             : Factor w/ 2 levels "BAD","GOOD": 2 1 2 2 1 2 2 2 2 1 ...
##  $ Gender                  : Factor w/ 2 levels "Female","Male": 1 2 1 1 1 1 1 1 1 1 ...

#
# Shorten variable names for plotting:
colnames(credit) <- c("checking","months","history","purpose","amount","savings","employment","rate","guarantors","residence","property","age","inst_plans","housing","nloans","job","nclients","phone","foreign","credit","sex")
#
# Give factors sex and purpose shorter labels for plotting:
credit$sex = factor(as.numeric(credit$sex), labels = c("F","M"))
levels(credit$purpose)

##  [1] "business"              "car (new)"             "car (used)"           
##  [4] "domestic appliances"   "education"             "furniture / equipment"
##  [7] "others"                "radio / television"    "repairs"              
## [10] "retrainin"

# [10] "retrainin"
levels(credit$purpose) = c("business","n.car","u.car","appliances","education","furniture","other","TV","repairs","retraining")
# save(credit, file = "German_credit.RData")
# load("German_credit.RData")

Now fit a logistic regression model

fit7 <- glm(credit ~ months + purpose + amount + rate + age +
            nclients + sex, family=binomial, data = credit)

Figure in Supplementary Material:

main = "German credit data"
# fact = 0.48
# width = fact*10; height = fact*8
# pdf(file = "germancredit_7_predsplot.pdf",
#     width = width, height = height)
predsplot(fit7, main = main)

## 
## Prediction terms in the model:
##                    prediction term   stdev up/down
##                             months 0.37690    down
##                            purpose 0.51700        
##                             amount 0.26380    down
##                               rate 0.27320    down
##                                age 0.24350      up
##                           nclients 0.07078    down
##                                sex 0.21110        
##  Total linear prediction of credit 0.80900        
## 
## The glm fit used the logit link function.

# dev.off()

Figure 4:

# fact = 0.48
# width = fact*10; height = fact*8
# pdf(file = "germancredit_7_predsplot_1.pdf",
#     width = width, height = height)
predsplot(fit7, main = main, casetoshow=1, 
          displaytype = "density")

## 
##  Prediction terms for case 1:
##                               prediction term    value
##                                        months +0.46584
##                                       purpose +0.37611
##                                        amount +0.19645
##                                          rate -0.25082
##                                           age +0.67325
##                                      nclients +0.03030
##                                           sex +0.14143
##                                           SUM +1.63255
##                                    centercept  0.95998
##             Total linear prediction of credit  2.59253
##  Total prediction of credit in response units  0.93038
## 
## The glm fit used the logit link function.

# dev.off()

Figure 5:

# pdf(file = "germancredit_7_predsplot_2_stairs.pdf",
#     width = width, height = height)
predsplot(fit7, main = main, casetoshow=2, staircase = TRUE)

## 
##  Prediction terms for case 2:
##                               prediction term    value
##                                        months -0.84700
##                                       purpose +0.37611
##                                        amount -0.25041
##                                          rate +0.23763
##                                           age -0.28994
##                                      nclients +0.03030
##                                           sex -0.31479
##                                           SUM -1.05811
##                                    centercept  0.95998
##             Total linear prediction of credit -0.09813
##  Total prediction of credit in response units  0.47549
## 
## The glm fit used the logit link function.

# dev.off()

Figure 6:

# fact = 0.6
# width = fact*10; height = fact*8
# pdf(file = "germancredit_7_predscor_equalsizes.pdf", 
#     width = height, height = height)
predscor(fit7, sort.by.stdev = FALSE, cell.length = "equal")

## Correlation matrix of the prediction terms:
##          months purpose amount  rate   age nclients   sex
## months     1.00   -0.12   0.62  0.07  0.04    -0.02 -0.08
## purpose   -0.12    1.00  -0.12 -0.01 -0.03     0.04  0.05
## amount     0.62   -0.12   1.00 -0.27 -0.03     0.02 -0.09
## rate       0.07   -0.01  -0.27  1.00 -0.06    -0.07 -0.09
## age        0.04   -0.03  -0.03 -0.06  1.00    -0.12  0.16
## nclients  -0.02    0.04   0.02 -0.07 -0.12     1.00 -0.20
## sex       -0.08    0.05  -0.09 -0.09  0.16    -0.20  1.00

# dev.off()
#
# pdf(file = "germancredit_7_predscor.pdf", 
#     width = height, height = height)
predscor(fit7, sort.by.stdev = FALSE)

## Correlation matrix of the prediction terms:
##          months purpose amount  rate   age nclients   sex
## months     1.00   -0.12   0.62  0.07  0.04    -0.02 -0.08
## purpose   -0.12    1.00  -0.12 -0.01 -0.03     0.04  0.05
## amount     0.62   -0.12   1.00 -0.27 -0.03     0.02 -0.09
## rate       0.07   -0.01  -0.27  1.00 -0.06    -0.07 -0.09
## age        0.04   -0.03  -0.03 -0.06  1.00    -0.12  0.16
## nclients  -0.02    0.04   0.02 -0.07 -0.12     1.00 -0.20
## sex       -0.08    0.05  -0.09 -0.09  0.16    -0.20  1.00

# dev.off()

Two figures in Supplementary Material:

## Prediction for a new case:
newc = list("u.car", 36, 2, 6000, 55, "F", 1)
names(newc) = c("purpose","months","rate","amount",
                "age","sex","nclients")
# fact = 0.48
# width = fact*10; height = fact*8
# pdf(file = "germancredit_7_predsplot_new_stairs.pdf",
#     width = width, height = height)
predsplot(fit7, main = main, casetoshow = newc, 
          staircase = TRUE)

## 
##  The case to show is: 
## $purpose
## [1] "u.car"
## 
## $months
## [1] 36
## 
## $rate
## [1] 2
## 
## $amount
## [1] 6000
## 
## $age
## [1] 55
## 
## $sex
## [1] "F"
## 
## $nclients
## [1] 1
## 
## 
##  Prediction terms for the new case:
##                               prediction term    value
##                                        months -0.47190
##                                       purpose +1.02816
##                                        amount -0.25499
##                                          rate +0.23763
##                                           age +0.41640
##                                      nclients +0.03030
##                                           sex +0.14143
##                                           SUM +1.12701
##                                    centercept  0.95998
##             Total linear prediction of credit  2.08699
##  Total prediction of credit in response units  0.88963
## 
## The glm fit used the logit link function.

# dev.off()

## Figure with profile:
# pdf(file = "germancredit_7_predsplot_1_dens_profile.pdf",
#     width = width, height = height)
predsplot(fit7, main = main, casetoshow=1, 
               displaytype = "density", profile = TRUE)

## 
##  Prediction terms for case 1:
##                               prediction term    value
##                                        months +0.46584
##                                       purpose +0.37611
##                                        amount +0.19645
##                                          rate -0.25082
##                                           age +0.67325
##                                      nclients +0.03030
##                                           sex +0.14143
##                                           SUM +1.63255
##                                    centercept  0.95998
##             Total linear prediction of credit  2.59253
##  Total prediction of credit in response units  0.93038
## 
## The glm fit used the logit link function.

# dev.off()

Artificial example to illustrate high correlation:

credit$x1 = credit$months + credit$nclients
credit$x2 = credit$months - credit$nclients
cor(credit$x1,credit$x2) # 0.998199

## [1] 0.9981994

# But minus that for prediction terms!

fitart <- glm(credit ~ x1 + x2 + purpose + amount + rate + age +
           sex, family=binomial, data = credit)

## Figure 7:
## 
# fact = 0.48
# width = fact*10; height = fact*8
# main = "German credit data with artificial variables x1 and x2"
# pdf(file = "germancredit_7_artif_predsplot.pdf", 
#     width = width, height = height)
predsplot(fitart, main = main)

## 
## Prediction terms in the model:
##                    prediction term  stdev up/down
##                                 x1 1.3670    down
##                                 x2 0.9913      up
##                            purpose 0.5170        
##                             amount 0.2638    down
##                               rate 0.2732    down
##                                age 0.2435      up
##                                sex 0.2111        
##  Total linear prediction of credit 0.8090        
## 
## The glm fit used the logit link function.

# dev.off()

## Figure 8:
## 
# fact = 0.6
# width = fact*10; height = fact*8
# pdf(file = "germancredit_7_artif_predscor.pdf", 
#     width = height, height = height)
predscor(fitart, sort.by.stdev = FALSE)

## Correlation matrix of the prediction terms:
##            x1    x2 purpose amount  rate   age   sex
## x1       1.00 -1.00   -0.12   0.63  0.07  0.03 -0.09
## x2      -1.00  1.00    0.12  -0.62 -0.08 -0.04  0.08
## purpose -0.12  0.12    1.00  -0.12 -0.01 -0.03  0.05
## amount   0.63 -0.62   -0.12   1.00 -0.27 -0.03 -0.09
## rate     0.07 -0.08   -0.01  -0.27  1.00 -0.06 -0.09
## age      0.03 -0.04   -0.03  -0.03 -0.06  1.00  0.16
## sex     -0.09  0.08    0.05  -0.09 -0.09  0.16  1.00

# dev.off()

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.