mclust is a contributed R package for model-based clustering, classification, and density estimation based on finite normal mixture modelling. It provides functions for parameter estimation via the EM algorithm for normal mixture models with a variety of covariance structures, and functions for simulation from these models. Also included are functions that combine model-based hierarchical clustering, EM for mixture estimation and the Bayesian Information Criterion (BIC) in comprehensive strategies for clustering, density estimation and discriminant analysis. Additional functionalities are available for displaying and visualizing fitted models along with clustering, classification, and density estimation results.
This document gives a quick tour of mclust (version 5.0.2) functionalities. It was written in R Markdown, using the knitr package for production. See help(package="mclust") for further details and references provided by citation("mclust").
library(mclust)
## Package 'mclust' version 5.0.2
## Type 'citation("mclust")' for citing this R package in publications.
# Load the diabetes data and tabulate its class column.
data(diabetes)
class <- diabetes$class
table(class)
## class
## Chemical Normal Overt
## 36 76 33
# Keep every column except the first as the feature set and preview it.
X <- diabetes[, -1]
head(X)
## glucose insulin sspg
## 1 80 356 124
## 2 97 289 117
## 3 105 319 143
## 4 90 356 199
## 5 90 323 240
## 6 86 381 157
# Pairs plot of the features, with the class labels passed as classification.
clPairs(X, class)
# Compute BIC over the candidate models, then plot and summarise the result.
BIC <- mclustBIC(X)
plot(BIC)
summary(BIC)
## Best BIC values:
## VVV,3 VVE,3 EVE,4
## BIC -4760.091 -4775.53521 -4793.26174
## BIC diff 0.000 -15.44456 -33.17109
# Fit the mixture model to X and print its summary together with the
# estimated parameters.
mod1 <- Mclust(X)
summary(mod1, parameters = TRUE)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust VVV (ellipsoidal, varying volume, shape, and orientation) model with 3 components:
##
## log.likelihood n df BIC ICL
## -2307.883 145 29 -4760.091 -4776.086
##
## Clustering table:
## 1 2 3
## 82 33 30
##
## Mixing probabilities:
## 1 2 3
## 0.5618662 0.2233077 0.2148261
##
## Means:
## [,1] [,2] [,3]
## glucose 91.41166 105.1978 219.40355
## insulin 358.82811 517.0420 1041.36946
## sspg 166.15629 320.7894 98.33493
##
## Variances:
## [,,1]
## glucose insulin sspg
## glucose 61.94482 98.79404 35.04002
## insulin 98.79404 2123.14811 387.06112
## sspg 35.04002 387.06112 2681.21187
## [,,2]
## glucose insulin sspg
## glucose 153.2207 795.5581 -494.7416
## insulin 795.5581 6513.7790 -2846.4416
## sspg -494.7416 -2846.4416 26074.9825
## [,,3]
## glucose insulin sspg
## glucose 6344.602 26160.77 -4433.706
## insulin 26160.768 122003.26 -22714.951
## sspg -4433.706 -22714.95 5892.777
# Plot the classification produced by mod1, then cross-tabulate the class
# labels against the cluster assignments stored in mod1$classification.
plot(mod1, what = "classification")
table(class, mod1$classification)
##
## class 1 2 3
## Chemical 8 26 2
## Normal 74 2 0
## Overt 0 5 28
# Compute ICL for X and summarise the best-scoring models.
ICL <- mclustICL(X)
summary(ICL)
## Best ICL values:
## VVV,3 VVE,3 EVE,4
## ICL -4776.086 -4793.2680 -4809.16854
## ICL diff 0.000 -17.1821 -33.08265
plot(ICL)
# Bootstrap likelihood-ratio test on X, restricted to the VVV model.
LRT <- mclustBootstrapLRT(X, modelName = "VVV")
LRT
## Bootstrap sequential LRT for the number of mixture components
## -------------------------------------------------------------
## Model = VVV
## Replications = 999
## LRTS bootstrap p-value
## 1 vs 2 361.186445 0.001
## 2 vs 3 114.703559 0.001
## 3 vs 4 7.437806 0.944
# Load iris; the Species column supplies the class labels.
data(iris)
class <- iris$Species
table(class)
## class
## setosa versicolor virginica
## 50 50 50
# The first four columns are used as the feature set.
X <- iris[, 1:4]
head(X)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
# Fit a discriminant-analysis model of type "EDDA" to the iris features.
mod2 <- MclustDA(X, class, modelType = "EDDA")
summary(mod2)
## ------------------------------------------------
## Gaussian finite mixture model for classification
## ------------------------------------------------
##
## EDDA model summary:
##
## log.likelihood n df BIC
## -187.7097 150 36 -555.8024
##
## Classes n Model G
## setosa 50 VEV 1
## versicolor 50 VEV 1
## virginica 50 VEV 1
##
## Training classification summary:
##
## Predicted
## Class setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 47 3
## virginica 0 0 50
##
## Training error = 0.02
# Scatterplot and classification plots for the iris model.
plot(mod2, what = "scatterplot")
plot(mod2, what = "classification")
# Load the banknote data; the Status column supplies the class labels.
data(banknote)
class <- banknote$Status
table(class)
## class
## counterfeit genuine
## 100 100
# Keep every column except the first as the feature set and preview it.
X <- banknote[, -1]
head(X)
## Length Left Right Bottom Top Diagonal
## 1 214.8 131.0 131.1 9.0 9.7 141.0
## 2 214.6 129.7 129.7 8.1 9.5 141.7
## 3 214.8 129.7 129.7 8.7 9.6 142.2
## 4 214.8 129.7 129.6 7.5 10.4 142.0
## 5 215.0 129.6 129.7 10.4 7.7 141.8
## 6 215.7 130.8 130.5 9.0 10.1 141.4
# Fit a discriminant-analysis model to the banknote features using the
# default model type.
mod3 <- MclustDA(X, class)
summary(mod3)
## ------------------------------------------------
## Gaussian finite mixture model for classification
## ------------------------------------------------
##
## MclustDA model summary:
##
## log.likelihood n df BIC
## -646.0798 200 66 -1641.848
##
## Classes n Model G
## counterfeit 100 EVE 2
## genuine 100 XXX 1
##
## Training classification summary:
##
## Predicted
## Class counterfeit genuine
## counterfeit 100 0
## genuine 0 100
##
## Training error = 0
# Scatterplot and classification plots for the banknote model.
plot(mod3, what = "scatterplot")
plot(mod3, what = "classification")
# 10-fold cross-validation of both classifiers. The error estimate and its
# standard error are selected by name rather than by position (2:3), so the
# call does not depend on the order of the elements in the returned list.
unlist(cvMclustDA(mod2, nfold = 10)[c("error", "se")])
## error se
## 0.02666667 0.01474055
unlist(cvMclustDA(mod3, nfold = 10)[c("error", "se")])
## error se
## 0.005 0.005
# Load the acidity data and fit a mixture-based density estimate to it.
data(acidity)
mod4 <- densityMclust(acidity)
summary(mod4)
## -------------------------------------------------------
## Density estimation via Gaussian finite mixture modeling
## -------------------------------------------------------
##
## Mclust E (univariate, equal variance) model with 2 components:
##
## log.likelihood n df BIC ICL
## -185.9493 155 4 -392.0723 -398.5554
##
## Clustering table:
## 1 2
## 98 57
# BIC, density (drawn with the data, breaks = 15) and diagnostic plots for
# the acidity density estimate.
plot(mod4, what = "BIC")
plot(mod4, what = "density", data = acidity, breaks = 15)
plot(mod4, what = "diagnostic", type = "cdf")
plot(mod4, what = "diagnostic", type = "qq")
# Load the faithful data and fit a mixture-based density estimate to it.
data(faithful)
mod5 <- densityMclust(faithful)
summary(mod5)
## -------------------------------------------------------
## Density estimation via Gaussian finite mixture modeling
## -------------------------------------------------------
##
## Mclust EEE (ellipsoidal, equal volume, shape and orientation) model with 3 components:
##
## log.likelihood n df BIC ICL
## -1126.361 272 11 -2314.386 -2360.865
##
## Clustering table:
## 1 2 3
## 130 97 45
# BIC plot and three renderings of the faithful density estimate.
plot(mod5, what = "BIC")
plot(mod5, what = "density")
plot(
  mod5,
  what = "density",
  type = "image",
  col = "dodgerblue3",
  grid = 100
)
plot(mod5, what = "density", type = "persp")
# Bootstrap the fitted clustering model mod1 and report standard errors.
boot1 <- MclustBootstrap(mod1)
summary(boot1, what = "se")
## ----------------------------------------------------------
## Bootstrap standard errors
## ----------------------------------------------------------
## Model = VVV
## Num. of mixture components = 3
## Replications = 999
## Type = nonparametric bootstrap
##
## Mixing probabilities:
## 1 2 3
## 0.05189752 0.04665731 0.03926142
##
## Means:
## 1 2 3
## glucose 1.019910 3.569284 17.48050
## insulin 7.015650 24.739314 78.21998
## sspg 6.849971 35.242230 17.99244
##
## Variances:
## [,,1]
## glucose insulin sspg
## glucose 11.26079 51.4028 53.66672
## insulin 51.40280 480.2080 346.70761
## sspg 53.66672 346.7076 534.22625
## [,,2]
## glucose insulin sspg
## glucose 61.10829 464.0365 498.6189
## insulin 464.03649 3544.7477 3283.7014
## sspg 498.61887 3283.7014 6660.8104
## [,,3]
## glucose insulin sspg
## glucose 1071.998 5922.253 1779.972
## insulin 5922.253 38015.518 11229.004
## sspg 1779.972 11229.004 3282.495
# Report the bootstrap results for mod1 as confidence intervals.
summary(boot1, what = "ci")
## ----------------------------------------------------------
## Bootstrap confidence intervals
## ----------------------------------------------------------
## Model = VVV
## Num. of mixture components = 3
## Replications = 999
## Type = nonparametric bootstrap
## Confidence level = 0.95
##
## Mixing probabilities:
## 1 2 3
## 2.5% 0.4559230 0.144385 0.1419683
## 97.5% 0.6505986 0.328485 0.2932974
##
## Means:
## [,,1]
## glucose insulin sspg
## 2.5% 89.28173 344.3242 152.0196
## 97.5% 93.29578 371.7835 179.7732
## [,,2]
## glucose insulin sspg
## 2.5% 98.43833 470.4595 257.9772
## 97.5% 113.27674 575.0149 392.4529
## [,,3]
## glucose insulin sspg
## 2.5% 185.9629 887.3448 67.02259
## 97.5% 254.3765 1197.6796 138.06227
##
## Variances:
## [,,1]
## glucose insulin sspg
## 2.5% 38.92586 1211.564 1645.638
## 97.5% 84.40661 3097.105 3754.882
## [,,2]
## glucose insulin sspg
## 2.5% 64.24949 1992.481 13614.83
## 97.5% 315.06702 16824.828 40150.08
## [,,3]
## glucose insulin sspg
## 2.5% 4104.170 57265.39 1613.676
## 97.5% 8168.312 199044.22 12757.872
# Bootstrap the fitted density model mod4 and report standard errors.
boot4 <- MclustBootstrap(mod4)
summary(boot4, what = "se")
## ----------------------------------------------------------
## Bootstrap standard errors
## ----------------------------------------------------------
## Model = E
## Num. of mixture components = 2
## Replications = 999
## Type = nonparametric bootstrap
##
## Mixing probabilities:
## 1 2
## 0.03974359 0.03974359
##
## Means:
## 1 2
## 0.04561595 0.06819682
##
## Variances:
## 1 2
## 0.02411315 0.02411315
# Report the bootstrap results for mod4 as confidence intervals.
summary(boot4, what = "ci")
## ----------------------------------------------------------
## Bootstrap confidence intervals
## ----------------------------------------------------------
## Model = E
## Num. of mixture components = 2
## Replications = 999
## Type = nonparametric bootstrap
## Confidence level = 0.95
##
## Mixing probabilities:
## 1 2
## 2.5% 0.5465184 0.3032894
## 97.5% 0.6967106 0.4534816
##
## Means:
## 1 2
## 2.5% 4.280676 6.177506
## 97.5% 4.454121 6.452424
##
## Variances:
## 1 2
## 2.5% 0.1415196 0.1415196
## 97.5% 0.2382068 0.2382068
# Dimension reduction for the clustering model mod1, with default settings.
mod1dr <- MclustDR(mod1)
summary(mod1dr)
## -----------------------------------------------------------------
## Dimension reduction for model-based clustering and classification
## -----------------------------------------------------------------
##
## Mixture model type: Mclust (VVV, 3)
##
## Clusters n
## 1 82
## 2 33
## 3 30
##
## Estimated basis vectors:
## Dir1 Dir2 Dir3
## glucose -0.986035 0.23503 0.958753
## insulin 0.157678 -0.10980 -0.284026
## sspg -0.053589 -0.96577 -0.011029
##
## Dir1 Dir2 Dir3
## Eigenvalues 1.375 0.77745 0.65837
## Cum. % 48.919 76.57772 100.00000
# Pairs and boundary plots for the default dimension reduction of mod1.
plot(mod1dr, what = "pairs")
plot(mod1dr, what = "boundaries", ngrid = 200)
# Recompute with lambda = 1; this overwrites the previous mod1dr.
mod1dr <- MclustDR(mod1, lambda = 1)
summary(mod1dr)
## -----------------------------------------------------------------
## Dimension reduction for model-based clustering and classification
## -----------------------------------------------------------------
##
## Mixture model type: Mclust (VVV, 3)
##
## Clusters n
## 1 82
## 2 33
## 3 30
##
## Estimated basis vectors:
## Dir1 Dir2
## glucose 0.80928 0.92579
## insulin -0.56443 -0.19376
## sspg -0.16272 -0.32461
##
## Dir1 Dir2
## Eigenvalues 1.0589 0.39905
## Cum. % 72.6294 100.00000
# Scatterplot and boundary plots for the lambda = 1 reduction of mod1.
plot(mod1dr, what = "scatterplot")
plot(mod1dr, what = "boundaries", ngrid = 200)
# Dimension reduction for the iris classifier mod2.
mod2dr <- MclustDR(mod2)
summary(mod2dr)
## -----------------------------------------------------------------
## Dimension reduction for model-based clustering and classification
## -----------------------------------------------------------------
##
## Mixture model type: EDDA
##
## Classes n Model G
## setosa 50 VEV 1
## versicolor 50 VEV 1
## virginica 50 VEV 1
##
## Estimated basis vectors:
## Dir1 Dir2 Dir3 Dir4
## Sepal.Length 0.17425 -0.193663 0.64081 -0.46231
## Sepal.Width 0.45292 0.066561 0.34852 0.57110
## Petal.Length -0.61629 -0.311030 -0.42366 0.46256
## Petal.Width -0.62024 0.928076 0.53703 -0.49613
##
## Dir1 Dir2 Dir3 Dir4
## Eigenvalues 0.94747 0.68835 0.076141 0.052607
## Cum. % 53.69408 92.70374 97.018700 100.000000
# Scatterplot and boundary plots for the reduction of mod2.
plot(mod2dr, what = "scatterplot")
plot(mod2dr, what = "boundaries", ngrid = 200)
# Dimension reduction for the banknote classifier mod3.
mod3dr <- MclustDR(mod3)
summary(mod3dr)
## -----------------------------------------------------------------
## Dimension reduction for model-based clustering and classification
## -----------------------------------------------------------------
##
## Mixture model type: MclustDA
##
## Classes n Model G
## counterfeit 100 EVE 2
## genuine 100 XXX 1
##
## Estimated basis vectors:
## Dir1 Dir2 Dir3 Dir4 Dir5 Dir6
## Length -0.10053 -0.32853 0.797138 -0.033156 -0.3177204 0.084867
## Left -0.21757 -0.30448 -0.303012 -0.893163 0.3688689 -0.565765
## Right 0.29197 -0.01844 -0.495823 0.407884 -0.8616616 0.481501
## Bottom 0.57597 0.44534 0.120202 -0.034503 0.0042677 -0.078642
## Top 0.57549 0.38554 0.100993 -0.103770 0.1360968 0.625145
## Diagonal -0.44088 0.67235 -0.047607 -0.151104 -0.0445899 0.209460
##
## Dir1 Dir2 Dir3 Dir4 Dir5 Dir6
## Eigenvalues 0.87241 0.55366 0.48558 0.13317 0.053192 0.027221
## Cum. % 41.05017 67.10202 89.95013 96.21627 98.719164 100.000000
# Scatterplot and boundary plots for the reduction of mod3.
plot(mod3dr, what = "scatterplot")
plot(mod3dr, what = "boundaries", ngrid = 200)