Introduction

mclust is a contributed R package for model-based clustering, classification, and density estimation based on finite normal mixture modelling. It provides functions for parameter estimation via the EM algorithm for normal mixture models with a variety of covariance structures, and functions for simulation from these models. Also included are functions that combine model-based hierarchical clustering, EM for mixture estimation and the Bayesian Information Criterion (BIC) in comprehensive strategies for clustering, density estimation and discriminant analysis. Additional functionality is available for displaying and visualizing fitted models, along with clustering, classification, and density estimation results.

This document gives a quick tour of the functionality in mclust (version 5.0.2). It was written in R Markdown, using the knitr package for production. See help(package="mclust") for further details, and citation("mclust") for the references to cite when using the package.

library(mclust)
## Package 'mclust' version 5.0.2
## Type 'citation("mclust")' for citing this R package in publications.

Clustering

# Load the diabetes data and pull out its class column.
data(diabetes)
class <- diabetes$class
# Count the observations in each class.
table(class)
## class
## Chemical   Normal    Overt 
##       36       76       33
# Keep every column of diabetes except the first one, then preview the result.
X <- diabetes[, -1]
head(X)
##   glucose insulin sspg
## 1      80     356  124
## 2      97     289  117
## 3     105     319  143
## 4      90     356  199
## 5      90     323  240
## 6      86     381  157
# Plot the variables in X against each other, marked by the known class.
clPairs(X, class)

# Compute BIC for X with mclustBIC's default settings and plot the values.
BIC <- mclustBIC(X)
plot(BIC)

# Print the best-scoring entries.
summary(BIC)
## Best BIC values:
##              VVV,3       VVE,3       EVE,4
## BIC      -4760.091 -4775.53521 -4793.26174
## BIC diff     0.000   -15.44456   -33.17109
# Fit a mixture model to X with Mclust's default settings.
mod1 <- Mclust(X)
# Summarise the fit, including the estimated parameters.
summary(mod1, parameters = TRUE)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm 
## ----------------------------------------------------
## 
## Mclust VVV (ellipsoidal, varying volume, shape, and orientation) model with 3 components:
## 
##  log.likelihood   n df       BIC       ICL
##       -2307.883 145 29 -4760.091 -4776.086
## 
## Clustering table:
##  1  2  3 
## 82 33 30 
## 
## Mixing probabilities:
##         1         2         3 
## 0.5618662 0.2233077 0.2148261 
## 
## Means:
##              [,1]     [,2]       [,3]
## glucose  91.41166 105.1978  219.40355
## insulin 358.82811 517.0420 1041.36946
## sspg    166.15629 320.7894   98.33493
## 
## Variances:
## [,,1]
##          glucose    insulin       sspg
## glucose 61.94482   98.79404   35.04002
## insulin 98.79404 2123.14811  387.06112
## sspg    35.04002  387.06112 2681.21187
## [,,2]
##           glucose    insulin       sspg
## glucose  153.2207   795.5581  -494.7416
## insulin  795.5581  6513.7790 -2846.4416
## sspg    -494.7416 -2846.4416 26074.9825
## [,,3]
##           glucose   insulin       sspg
## glucose  6344.602  26160.77  -4433.706
## insulin 26160.768 122003.26 -22714.951
## sspg    -4433.706 -22714.95   5892.777
# Plot the classification produced by mod1.
plot(mod1, what = "classification")

# Cross-tabulate the known classes (rows) against mod1's cluster labels (columns).
table(class, mod1$classification)
##           
## class       1  2  3
##   Chemical  8 26  2
##   Normal   74  2  0
##   Overt     0  5 28
# Compute ICL for X with mclustICL's default settings and print the best entries.
ICL <- mclustICL(X)
summary(ICL)
## Best ICL values:
##              VVV,3      VVE,3       EVE,4
## ICL      -4776.086 -4793.2680 -4809.16854
## ICL diff     0.000   -17.1821   -33.08265
# Plot the ICL values.
plot(ICL)

# Bootstrap likelihood-ratio test for the number of components, using the
# "VVV" model.
# NOTE(review): no seed is set in the visible code, so the bootstrap
# p-values may differ from run to run.
LRT <- mclustBootstrapLRT(X, modelName = "VVV")
LRT
## Bootstrap sequential LRT for the number of mixture components
## -------------------------------------------------------------
## Model        = VVV 
## Replications = 999 
##                LRTS bootstrap p-value
## 1 vs 2   361.186445             0.001
## 2 vs 3   114.703559             0.001
## 3 vs 4     7.437806             0.944

Classification

EDDA

# Load iris and use its Species column as the class labels.
data(iris)
class <- iris$Species
# Count the observations in each species.
table(class)
## class
##     setosa versicolor  virginica 
##         50         50         50
# Keep the first four columns of iris and preview them.
X <- iris[, 1:4]
head(X)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1          5.1         3.5          1.4         0.2
## 2          4.9         3.0          1.4         0.2
## 3          4.7         3.2          1.3         0.2
## 4          4.6         3.1          1.5         0.2
## 5          5.0         3.6          1.4         0.2
## 6          5.4         3.9          1.7         0.4
# Fit a classification model to X and class with modelType = "EDDA",
# then summarise it.
mod2 <- MclustDA(X, class, modelType = "EDDA")
summary(mod2)
## ------------------------------------------------
## Gaussian finite mixture model for classification 
## ------------------------------------------------
## 
## EDDA model summary:
## 
##  log.likelihood   n df       BIC
##       -187.7097 150 36 -555.8024
##             
## Classes       n Model G
##   setosa     50   VEV 1
##   versicolor 50   VEV 1
##   virginica  50   VEV 1
## 
## Training classification summary:
## 
##             Predicted
## Class        setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         47         3
##   virginica       0          0        50
## 
## Training error = 0.02
# Two views of the EDDA fit: the scatterplot display, then the classification display.
plot(mod2, what = "scatterplot")

plot(mod2, what = "classification")

MclustDA

# Load the banknote data and use its Status column as the class labels.
data(banknote)
class <- banknote$Status
# Count the observations in each class.
table(class)
## class
## counterfeit     genuine 
##         100         100
# Keep every column of banknote except the first one, then preview the result.
X <- banknote[, -1]
head(X)
##   Length  Left Right Bottom  Top Diagonal
## 1  214.8 131.0 131.1    9.0  9.7    141.0
## 2  214.6 129.7 129.7    8.1  9.5    141.7
## 3  214.8 129.7 129.7    8.7  9.6    142.2
## 4  214.8 129.7 129.6    7.5 10.4    142.0
## 5  215.0 129.6 129.7   10.4  7.7    141.8
## 6  215.7 130.8 130.5    9.0 10.1    141.4
# Fit a classification model to X and class with MclustDA's default
# settings, then summarise it.
mod3 <- MclustDA(X, class)
summary(mod3)
## ------------------------------------------------
## Gaussian finite mixture model for classification 
## ------------------------------------------------
## 
## MclustDA model summary:
## 
##  log.likelihood   n df       BIC
##       -646.0798 200 66 -1641.848
##              
## Classes         n Model G
##   counterfeit 100   EVE 2
##   genuine     100   XXX 1
## 
## Training classification summary:
## 
##              Predicted
## Class         counterfeit genuine
##   counterfeit         100       0
##   genuine               0     100
## 
## Training error = 0
# Two views of the mod3 fit: the scatterplot display, then the classification display.
plot(mod3, what = "scatterplot")

plot(mod3, what = "classification")

Cross-validation error

# 10-fold cross-validation of mod2; only elements 2 and 3 of the result are
# kept and flattened into a vector.
# NOTE(review): no seed is set in the visible code; if the folds are drawn at
# random, these numbers will vary between runs.
unlist(cvMclustDA(mod2, nfold = 10)[2:3])
##      error         se 
## 0.02666667 0.01474055
# 10-fold cross-validation of mod3; only elements 2 and 3 of the result are
# kept and flattened into a vector.
# NOTE(review): no seed is set in the visible code; if the folds are drawn at
# random, these numbers will vary between runs.
unlist(cvMclustDA(mod3, nfold = 10)[2:3])
## error    se 
## 0.005 0.005

Density estimation

Univariate

# Load the acidity data, estimate its density with densityMclust's default
# settings, and summarise the fit.
data(acidity)
mod4 <- densityMclust(acidity)
summary(mod4)
## -------------------------------------------------------
## Density estimation via Gaussian finite mixture modeling 
## -------------------------------------------------------
## 
## Mclust E (univariate, equal variance) model with 2 components:
## 
##  log.likelihood   n df       BIC       ICL
##       -185.9493 155  4 -392.0723 -398.5554
## 
## Clustering table:
##  1  2 
## 98 57
# BIC display for the density fit.
plot(mod4, what = "BIC")

# Density display with the acidity data supplied; breaks = 15 presumably
# controls the histogram binning -- confirm against the plot method's docs.
plot(mod4, what = "density", data = acidity, breaks = 15)

# Diagnostic displays: first the "cdf" type, then the "qq" type.
plot(mod4, what = "diagnostic", type = "cdf")

plot(mod4, what = "diagnostic", type = "qq")

Multivariate

# Load the faithful data, estimate its density with densityMclust's default
# settings, and summarise the fit.
data(faithful)
mod5 <- densityMclust(faithful)
summary(mod5)
## -------------------------------------------------------
## Density estimation via Gaussian finite mixture modeling 
## -------------------------------------------------------
## 
## Mclust EEE (ellipsoidal, equal volume, shape and orientation) model with 3 components:
## 
##  log.likelihood   n df       BIC       ICL
##       -1126.361 272 11 -2314.386 -2360.865
## 
## Clustering table:
##   1   2   3 
## 130  97  45
# BIC display for the density fit.
plot(mod5, what = "BIC")

# Density display with the default plot type.
plot(mod5, what = "density")

# Same density drawn with type = "image", a single named colour and grid = 100.
plot(mod5, what = "density", type = "image", 
     col = "dodgerblue3", grid = 100)

# Same density drawn with type = "persp".
plot(mod5, what = "density", type = "persp")

Bootstrap inference

# Bootstrap the parameters of mod1 with MclustBootstrap's default settings
# and report the standard errors.
# NOTE(review): no seed is set in the visible code, so the bootstrap
# results may differ from run to run.
boot1 <- MclustBootstrap(mod1)
summary(boot1, what = "se")
## ----------------------------------------------------------
## Bootstrap standard errors
## ----------------------------------------------------------
## Model                      = VVV 
## Num. of mixture components = 3 
## Replications               = 999 
## Type                       = nonparametric bootstrap 
## 
## Mixing probabilities:
##          1          2          3 
## 0.05189752 0.04665731 0.03926142 
## 
## Means:
##                1         2        3
## glucose 1.019910  3.569284 17.48050
## insulin 7.015650 24.739314 78.21998
## sspg    6.849971 35.242230 17.99244
## 
## Variances:
## [,,1]
##          glucose  insulin      sspg
## glucose 11.26079  51.4028  53.66672
## insulin 51.40280 480.2080 346.70761
## sspg    53.66672 346.7076 534.22625
## [,,2]
##           glucose   insulin      sspg
## glucose  61.10829  464.0365  498.6189
## insulin 464.03649 3544.7477 3283.7014
## sspg    498.61887 3283.7014 6660.8104
## [,,3]
##          glucose   insulin      sspg
## glucose 1071.998  5922.253  1779.972
## insulin 5922.253 38015.518 11229.004
## sspg    1779.972 11229.004  3282.495
# Report confidence intervals from the same bootstrap object.
summary(boot1, what = "ci")
## ----------------------------------------------------------
## Bootstrap confidence intervals
## ----------------------------------------------------------
## Model                      = VVV 
## Num. of mixture components = 3 
## Replications               = 999 
## Type                       = nonparametric bootstrap 
## Confidence level           = 0.95 
## 
## Mixing probabilities:
##               1        2         3
## 2.5%  0.4559230 0.144385 0.1419683
## 97.5% 0.6505986 0.328485 0.2932974
## 
## Means:
## [,,1]
##        glucose  insulin     sspg
## 2.5%  89.28173 344.3242 152.0196
## 97.5% 93.29578 371.7835 179.7732
## [,,2]
##         glucose  insulin     sspg
## 2.5%   98.43833 470.4595 257.9772
## 97.5% 113.27674 575.0149 392.4529
## [,,3]
##        glucose   insulin      sspg
## 2.5%  185.9629  887.3448  67.02259
## 97.5% 254.3765 1197.6796 138.06227
## 
## Variances:
## [,,1]
##        glucose  insulin     sspg
## 2.5%  38.92586 1211.564 1645.638
## 97.5% 84.40661 3097.105 3754.882
## [,,2]
##         glucose   insulin     sspg
## 2.5%   64.24949  1992.481 13614.83
## 97.5% 315.06702 16824.828 40150.08
## [,,3]
##        glucose   insulin      sspg
## 2.5%  4104.170  57265.39  1613.676
## 97.5% 8168.312 199044.22 12757.872
# Bootstrap the parameters of mod4 with MclustBootstrap's default settings
# and report the standard errors.
# NOTE(review): no seed is set in the visible code, so the bootstrap
# results may differ from run to run.
boot4 <- MclustBootstrap(mod4)
summary(boot4, what = "se")
## ----------------------------------------------------------
## Bootstrap standard errors
## ----------------------------------------------------------
## Model                      = E 
## Num. of mixture components = 2 
## Replications               = 999 
## Type                       = nonparametric bootstrap 
## 
## Mixing probabilities:
##          1          2 
## 0.03974359 0.03974359 
## 
## Means:
##          1          2 
## 0.04561595 0.06819682 
## 
## Variances:
##          1          2 
## 0.02411315 0.02411315
# Report confidence intervals from the same bootstrap object.
summary(boot4, what = "ci")
## ----------------------------------------------------------
## Bootstrap confidence intervals
## ----------------------------------------------------------
## Model                      = E 
## Num. of mixture components = 2 
## Replications               = 999 
## Type                       = nonparametric bootstrap 
## Confidence level           = 0.95 
## 
## Mixing probabilities:
##               1         2
## 2.5%  0.5465184 0.3032894
## 97.5% 0.6967106 0.4534816
## 
## Means:
##              1        2
## 2.5%  4.280676 6.177506
## 97.5% 4.454121 6.452424
## 
## Variances:
##               1         2
## 2.5%  0.1415196 0.1415196
## 97.5% 0.2382068 0.2382068

Dimension reduction

Clustering

# Dimension reduction for the clustering model mod1, using MclustDR's
# default settings, followed by a summary.
mod1dr <- MclustDR(mod1)
summary(mod1dr)
## -----------------------------------------------------------------
## Dimension reduction for model-based clustering and classification 
## -----------------------------------------------------------------
## 
## Mixture model type: Mclust (VVV, 3)
##         
## Clusters  n
##        1 82
##        2 33
##        3 30
## 
## Estimated basis vectors:
##              Dir1     Dir2      Dir3
## glucose -0.986035  0.23503  0.958753
## insulin  0.157678 -0.10980 -0.284026
## sspg    -0.053589 -0.96577 -0.011029
## 
##               Dir1     Dir2      Dir3
## Eigenvalues  1.375  0.77745   0.65837
## Cum. %      48.919 76.57772 100.00000
# Pairs display of the estimated directions.
plot(mod1dr, what = "pairs")

# Boundaries display, evaluated with ngrid = 200.
plot(mod1dr, what = "boundaries", ngrid = 200)

# Redo the dimension reduction with lambda = 1; this overwrites the earlier
# mod1dr object.
mod1dr <- MclustDR(mod1, lambda = 1)
summary(mod1dr)
## -----------------------------------------------------------------
## Dimension reduction for model-based clustering and classification 
## -----------------------------------------------------------------
## 
## Mixture model type: Mclust (VVV, 3)
##         
## Clusters  n
##        1 82
##        2 33
##        3 30
## 
## Estimated basis vectors:
##             Dir1     Dir2
## glucose  0.80928  0.92579
## insulin -0.56443 -0.19376
## sspg    -0.16272 -0.32461
## 
##                Dir1      Dir2
## Eigenvalues  1.0589   0.39905
## Cum. %      72.6294 100.00000
# Scatterplot display of the lambda = 1 fit.
plot(mod1dr, what = "scatterplot")

# Boundaries display of the same fit, evaluated with ngrid = 200.
plot(mod1dr, what = "boundaries", ngrid = 200)

Classification

# Dimension reduction for the EDDA classification model mod2, followed by
# a summary.
mod2dr <- MclustDR(mod2)
summary(mod2dr)
## -----------------------------------------------------------------
## Dimension reduction for model-based clustering and classification 
## -----------------------------------------------------------------
## 
## Mixture model type: EDDA 
##             
## Classes       n Model G
##   setosa     50   VEV 1
##   versicolor 50   VEV 1
##   virginica  50   VEV 1
## 
## Estimated basis vectors:
##                  Dir1      Dir2     Dir3     Dir4
## Sepal.Length  0.17425 -0.193663  0.64081 -0.46231
## Sepal.Width   0.45292  0.066561  0.34852  0.57110
## Petal.Length -0.61629 -0.311030 -0.42366  0.46256
## Petal.Width  -0.62024  0.928076  0.53703 -0.49613
## 
##                 Dir1     Dir2      Dir3       Dir4
## Eigenvalues  0.94747  0.68835  0.076141   0.052607
## Cum. %      53.69408 92.70374 97.018700 100.000000
# Scatterplot display of the mod2dr directions.
plot(mod2dr, what = "scatterplot")

# Boundaries display of mod2dr, evaluated with ngrid = 200.
plot(mod2dr, what = "boundaries", ngrid = 200)

# Dimension reduction for the classification model mod3, followed by a summary.
mod3dr <- MclustDR(mod3)
summary(mod3dr)
## -----------------------------------------------------------------
## Dimension reduction for model-based clustering and classification 
## -----------------------------------------------------------------
## 
## Mixture model type: MclustDA 
##              
## Classes         n Model G
##   counterfeit 100   EVE 2
##   genuine     100   XXX 1
## 
## Estimated basis vectors:
##              Dir1     Dir2      Dir3      Dir4       Dir5      Dir6
## Length   -0.10053 -0.32853  0.797138 -0.033156 -0.3177204  0.084867
## Left     -0.21757 -0.30448 -0.303012 -0.893163  0.3688689 -0.565765
## Right     0.29197 -0.01844 -0.495823  0.407884 -0.8616616  0.481501
## Bottom    0.57597  0.44534  0.120202 -0.034503  0.0042677 -0.078642
## Top       0.57549  0.38554  0.100993 -0.103770  0.1360968  0.625145
## Diagonal -0.44088  0.67235 -0.047607 -0.151104 -0.0445899  0.209460
## 
##                 Dir1     Dir2     Dir3     Dir4      Dir5       Dir6
## Eigenvalues  0.87241  0.55366  0.48558  0.13317  0.053192   0.027221
## Cum. %      41.05017 67.10202 89.95013 96.21627 98.719164 100.000000
# Scatterplot display of the mod3dr directions.
plot(mod3dr, what = "scatterplot")

# Boundaries display of mod3dr, evaluated with ngrid = 200.
plot(mod3dr, what = "boundaries", ngrid = 200)