Introduction

mclust is a contributed R package for model-based clustering, classification, and density estimation based on finite normal mixture modelling. It provides functions for parameter estimation via the EM algorithm for normal mixture models with a variety of covariance structures, and functions for simulation from these models. Also included are functions that combine model-based hierarchical clustering, EM for mixture estimation and the Bayesian Information Criterion (BIC) in comprehensive strategies for clustering, density estimation and discriminant analysis. Additional functionalities are available for displaying and visualizing fitted models along with clustering, classification, and density estimation results.

This document gives a quick tour of mclust functionalities. It was written in R Markdown, using the knitr package for production. It corresponds to mclust version 5.0.0. See the help pages for further details, and see the references provided by citation("mclust").

library(mclust)
## Package 'mclust' version 5.0.0
## Type 'citation("mclust")' for citing this R package in publications.

Clustering

data(diabetes)
class = diabetes$class
table(class)
## class
## Chemical   Normal    Overt 
##       36       76       33
X = diabetes[,-1]
head(X)
##   glucose insulin sspg
## 1      80     356  124
## 2      97     289  117
## 3     105     319  143
## 4      90     356  199
## 5      90     323  240
## 6      86     381  157
clPairs(X, class)

BIC = mclustBIC(X)
plot(BIC)

summary(BIC)
## Best BIC values:
##              VVV,3       VVE,3       EVE,4
## BIC      -4770.044 -4785.48868 -4803.21521
## BIC diff     0.000   -15.44456   -33.17109
mod1 = Mclust(X)
summary(mod1, parameters = TRUE)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm 
## ----------------------------------------------------
## 
## Mclust VVV (ellipsoidal, varying volume, shape, and orientation) model with 3 components:
## 
##  log.likelihood   n df       BIC       ICL
##       -2307.883 145 29 -4760.091 -4776.086
## 
## Clustering table:
##  1  2  3 
## 82 33 30 
## 
## Mixing probabilities:
##         1         2         3 
## 0.5618662 0.2233077 0.2148261 
## 
## Means:
##              [,1]     [,2]       [,3]
## glucose  91.41166 105.1978  219.40355
## insulin 358.82811 517.0420 1041.36946
## sspg    166.15629 320.7894   98.33493
## 
## Variances:
## [,,1]
##          glucose    insulin       sspg
## glucose 61.94482   98.79404   35.04002
## insulin 98.79404 2123.14811  387.06112
## sspg    35.04002  387.06112 2681.21187
## [,,2]
##           glucose    insulin       sspg
## glucose  153.2207   795.5581  -494.7416
## insulin  795.5581  6513.7790 -2846.4416
## sspg    -494.7416 -2846.4416 26074.9825
## [,,3]
##           glucose   insulin       sspg
## glucose  6344.602  26160.77  -4433.706
## insulin 26160.768 122003.26 -22714.951
## sspg    -4433.706 -22714.95   5892.777
plot(mod1, what = "classification")

table(class, mod1$classification)
##           
## class       1  2  3
##   Chemical  8 26  2
##   Normal   74  2  0
##   Overt     0  5 28
ICL = mclustICL(X)
summary(ICL)
## Best ICL values:
##              VVV,3      VVE,3       EVE,4
## ICL      -4776.086 -4793.2680 -4809.16854
## ICL diff     0.000   -17.1821   -33.08265
plot(ICL)

LRT = mclustBootstrapLRT(X, modelName = "VVV")
LRT
## Bootstrap sequential LRT for the number of mixture components
## -------------------------------------------------------------
## Model        = VVV 
## Replications = 999 
##                LRTS bootstrap p-value
## 1 vs 2   361.186445             0.001
## 2 vs 3   114.703559             0.001
## 3 vs 4     7.437806             0.939

Classification

EDDA

data(iris)
class = iris$Species
table(class)
## class
##     setosa versicolor  virginica 
##         50         50         50
X = iris[,1:4]
head(X)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1          5.1         3.5          1.4         0.2
## 2          4.9         3.0          1.4         0.2
## 3          4.7         3.2          1.3         0.2
## 4          4.6         3.1          1.5         0.2
## 5          5.0         3.6          1.4         0.2
## 6          5.4         3.9          1.7         0.4
mod2 = MclustDA(X, class, modelType = "EDDA")
summary(mod2)
## ------------------------------------------------
## Gaussian finite mixture model for classification 
## ------------------------------------------------
## 
## EDDA model summary:
## 
##  log.likelihood   n df       BIC
##       -187.7097 150 38 -565.8236
##             
## Classes       n Model G
##   setosa     50   VEV 1
##   versicolor 50   VEV 1
##   virginica  50   VEV 1
## 
## Training classification summary:
## 
##             Predicted
## Class        setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         47         3
##   virginica       0          0        50
## 
## Training error = 0.02
plot(mod2, what = "scatterplot")

plot(mod2, what = "classification")

MclustDA

data(banknote)
class = banknote$Status
table(class)
## class
## counterfeit     genuine 
##         100         100
X = banknote[,-1]
head(X)
##   Length  Left Right Bottom  Top Diagonal
## 1  214.8 131.0 131.1    9.0  9.7    141.0
## 2  214.6 129.7 129.7    8.1  9.5    141.7
## 3  214.8 129.7 129.7    8.7  9.6    142.2
## 4  214.8 129.7 129.6    7.5 10.4    142.0
## 5  215.0 129.6 129.7   10.4  7.7    141.8
## 6  215.7 130.8 130.5    9.0 10.1    141.4
mod3 = MclustDA(X, class)
summary(mod3)
## ------------------------------------------------
## Gaussian finite mixture model for classification 
## ------------------------------------------------
## 
## MclustDA model summary:
## 
##  log.likelihood   n df       BIC
##       -646.0798 200 66 -1641.848
##              
## Classes         n Model G
##   counterfeit 100   EVE 2
##   genuine     100   XXX 1
## 
## Training classification summary:
## 
##              Predicted
## Class         counterfeit genuine
##   counterfeit         100       0
##   genuine               0     100
## 
## Training error = 0
plot(mod3, what = "scatterplot")

plot(mod3, what = "classification")

Cross-validation error

unlist(cvMclustDA(mod2, nfold = 10)[2:3])
##      error         se 
## 0.02666667 0.01088662
unlist(cvMclustDA(mod3, nfold = 10)[2:3])
## error    se 
## 0.005 0.005

Density estimation

Univariate

data(acidity)
mod4 = densityMclust(acidity)
summary(mod4)
## -------------------------------------------------------
## Density estimation via Gaussian finite mixture modeling 
## -------------------------------------------------------
## 
## Mclust E (univariate, equal variance) model with 2 components:
## 
##  log.likelihood   n df       BIC       ICL
##       -185.9493 155  4 -392.0723 -398.5554
## 
## Clustering table:
##  1  2 
## 98 57
plot(mod4, what = "BIC")

plot(mod4, what = "density", data = acidity, breaks = 15)

plot(mod4, what = "diagnostic", type = "cdf")

plot(mod4, what = "diagnostic", type = "qq")

Multivariate

data(faithful)
mod5 = densityMclust(faithful)
summary(mod5)
## -------------------------------------------------------
## Density estimation via Gaussian finite mixture modeling 
## -------------------------------------------------------
## 
## Mclust EEE (ellipsoidal, equal volume, shape and orientation) model with 3 components:
## 
##  log.likelihood   n df       BIC       ICL
##       -1126.361 272 11 -2314.386 -2360.865
## 
## Clustering table:
##   1   2   3 
## 130  97  45
plot(mod5, what = "BIC")

plot(mod5, what = "density")

plot(mod5, what = "density", type = "image", 
     col = "dodgerblue3", grid = 100)

plot(mod5, what = "density", type = "persp")

Bootstrap inference

boot1 = MclustBootstrap(mod1)
summary(boot1, what = "se")
## Bootstrap standard errors
## ----------------------------------------
## Model                        = VVV 
## Num. of mixture components   = 3 
## Replications                 = 999 
## 
## Mixing probabilities:
##          1          2          3 
## 0.05016287 0.04563044 0.03844920 
## 
## Means:
##                1        2        3
## glucose 1.048604  3.72232 17.52620
## insulin 7.436813 25.51307 75.88028
## sspg    7.253418 33.91974 17.05377
## 
## Variances:
## [,,1]
##          glucose   insulin      sspg
## glucose 11.18436  51.67467  52.06727
## insulin 51.67467 492.85449 364.44729
## sspg    52.06727 364.44729 560.23662
## [,,2]
##           glucose   insulin      sspg
## glucose  62.27318  467.1592  491.5765
## insulin 467.15918 3561.9528 3293.8502
## sspg    491.57655 3293.8502 7135.8855
## [,,3]
##          glucose   insulin      sspg
## glucose 1123.923  5964.213  1761.580
## insulin 5964.213 37241.661 10919.481
## sspg    1761.580 10919.481  3156.073
summary(boot1, what = "ci")
## Bootstrap confidence intervals
## ----------------------------------------
## Model                        = VVV 
## Num. of mixture components   = 3 
## Replications                 = 999 
## Confidence level             = 0.95 
## 
## Mixing probabilities:
##               1         2         3
## 2.5%  0.4644103 0.1434245 0.1401601
## 97.5% 0.6519155 0.3232752 0.2925669
## 
## Means:
## [,,1]
##        glucose  insulin     sspg
## 2.5%  89.33215 343.5801 151.9062
## 97.5% 93.36431 373.2421 180.1879
## [,,2]
##        glucose  insulin     sspg
## 2.5%   98.7532 472.1172 255.3913
## 97.5% 113.9337 578.6053 390.7337
## [,,3]
##        glucose   insulin      sspg
## 2.5%  189.4671  900.8685  68.43588
## 97.5% 255.2770 1191.9511 133.25715
## 
## Variances:
## [,,1]
##        glucose  insulin     sspg
## 2.5%  40.15036 1218.370 1637.185
## 97.5% 82.28988 3027.744 3912.103
## [,,2]
##         glucose   insulin     sspg
## 2.5%   63.46492  2020.641 12675.49
## 97.5% 330.39902 16497.320 40323.10
## [,,3]
##        glucose   insulin      sspg
## 2.5%  3988.261  56457.91  1542.744
## 97.5% 8358.278 197156.39 12434.134
boot4 = MclustBootstrap(mod4)
summary(boot4, what = "se")
## Bootstrap standard errors
## ----------------------------------------
## Model                        = E 
## Num. of mixture components   = 2 
## Replications                 = 999 
## 
## Mixing probabilities:
##          1          2 
## 0.04124314 0.04124314 
## 
## Means:
##          1          2 
## 0.04640443 0.06854613 
## 
## Variances:
##          1          2 
## 0.02375279 0.02375279
summary(boot4, what = "ci")
## Bootstrap confidence intervals
## ----------------------------------------
## Model                        = E 
## Num. of mixture components   = 2 
## Replications                 = 999 
## Confidence level             = 0.95 
## 
## Mixing probabilities:
##               1         2
## 2.5%  0.5381047 0.2981008
## 97.5% 0.7018992 0.4618953
## 
## Means:
##              1        2
## 2.5%  4.280819 6.185921
## 97.5% 4.461697 6.451439
## 
## Variances:
##               1         2
## 2.5%  0.1420548 0.1420548
## 97.5% 0.2352836 0.2352836

Dimension reduction

Clustering

mod1dr = MclustDR(mod1)
summary(mod1dr)
## -----------------------------------------------------------------
## Dimension reduction for model-based clustering and classification 
## -----------------------------------------------------------------
## 
## Mixture model type: Mclust (VVV, 3)
##         
## Clusters  n
##        1 82
##        2 33
##        3 30
## 
## Estimated basis vectors:
##              Dir1     Dir2      Dir3
## glucose -0.986035  0.23503  0.958753
## insulin  0.157678 -0.10980 -0.284026
## sspg    -0.053589 -0.96577 -0.011029
## 
##               Dir1     Dir2      Dir3
## Eigenvalues  1.375  0.77745   0.65837
## Cum. %      48.919 76.57772 100.00000
plot(mod1dr, what = "pairs")

plot(mod1dr, what = "boundaries", ngrid = 200)

mod1dr = MclustDR(mod1, lambda = 1)
summary(mod1dr)
## -----------------------------------------------------------------
## Dimension reduction for model-based clustering and classification 
## -----------------------------------------------------------------
## 
## Mixture model type: Mclust (VVV, 3)
##         
## Clusters  n
##        1 82
##        2 33
##        3 30
## 
## Estimated basis vectors:
##             Dir1     Dir2
## glucose  0.80928  0.92579
## insulin -0.56443 -0.19376
## sspg    -0.16272 -0.32461
## 
##                Dir1      Dir2
## Eigenvalues  1.0589   0.39905
## Cum. %      72.6294 100.00000
plot(mod1dr, what = "scatterplot")

plot(mod1dr, what = "boundaries", ngrid = 200)

Classification

mod2dr = MclustDR(mod2)
summary(mod2dr)
## -----------------------------------------------------------------
## Dimension reduction for model-based clustering and classification 
## -----------------------------------------------------------------
## 
## Mixture model type: EDDA 
##             
## Classes       n Model G
##   setosa     50   VEV 1
##   versicolor 50   VEV 1
##   virginica  50   VEV 1
## 
## Estimated basis vectors:
##                  Dir1      Dir2     Dir3     Dir4
## Sepal.Length  0.17425 -0.193663  0.64081 -0.46231
## Sepal.Width   0.45292  0.066561  0.34852  0.57110
## Petal.Length -0.61629 -0.311030 -0.42366  0.46256
## Petal.Width  -0.62024  0.928076  0.53703 -0.49613
## 
##                 Dir1     Dir2      Dir3       Dir4
## Eigenvalues  0.94747  0.68835  0.076141   0.052607
## Cum. %      53.69408 92.70374 97.018700 100.000000
plot(mod2dr, what = "scatterplot")

plot(mod2dr, what = "boundaries", ngrid = 200)

mod3dr = MclustDR(mod3)
summary(mod3dr)
## -----------------------------------------------------------------
## Dimension reduction for model-based clustering and classification 
## -----------------------------------------------------------------
## 
## Mixture model type: MclustDA 
##              
## Classes         n Model G
##   counterfeit 100   EVE 2
##   genuine     100   XXX 1
## 
## Estimated basis vectors:
##              Dir1     Dir2      Dir3      Dir4       Dir5      Dir6
## Length   -0.10053 -0.32853  0.797138 -0.033156 -0.3177204  0.084867
## Left     -0.21757 -0.30448 -0.303012 -0.893163  0.3688689 -0.565765
## Right     0.29197 -0.01844 -0.495823  0.407884 -0.8616616  0.481501
## Bottom    0.57597  0.44534  0.120202 -0.034503  0.0042677 -0.078642
## Top       0.57549  0.38554  0.100993 -0.103770  0.1360968  0.625145
## Diagonal -0.44088  0.67235 -0.047607 -0.151104 -0.0445899  0.209460
## 
##                 Dir1     Dir2     Dir3     Dir4      Dir5       Dir6
## Eigenvalues  0.87241  0.55366  0.48558  0.13317  0.053192   0.027221
## Cum. %      41.05017 67.10202 89.95013 96.21627 98.719164 100.000000
plot(mod3dr, what = "scatterplot")

plot(mod3dr, what = "boundaries", ngrid = 200)