This file contains some examples of the functions related to the DDC routine. More specifically, DDC
and cellMap
will be illustrated.
library("cellWise")
library("gridExtra") # has grid.arrange()
# Default options for DDC:
DDCpars = list(fracNA = 0.5, numDiscrete = 3, precScale = 1e-12,
cleanNAfirst = "automatic", tolProb = 0.99,
corrlim = 0.5, combinRule = "wmean",
returnBigXimp = FALSE, silent = FALSE,
nLocScale = 25000, fastDDC = FALSE,
standType = "1stepM", corrType = "gkwls",
transFun = "wrap", nbngbrs = 100)
# A small list giving the same results:
DDCpars = list(fastDDC = FALSE)
First we illustrate the selection of columns and rows by DDC.
This is actually done by the function checkDataSet
which is called by DDC.
i = c(1,2,3,4,5,6,7,8,9)
name = c("aa","bb","cc","dd","ee","ff","gg","hh","ii")
logic = c(TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE)
V1 = c(1.3,NaN,4.5,2.7,20.0,4.4,-2.1,1.1,-5)
V2 = c(2.3,NA,5,6,7,8,4,-10,0.5)
V3 = c(2,Inf,3,-4,5,6,7,-2,8)
Vna = c(1,-4,2,NaN,3,-Inf,NA,6,5)
Vdis = c(1,1,2,2,3,3,3,1,2)
V0s = c(1,1.5,2,2,2,2,2,3,2.5)
datafr = data.frame(i,name,logic,V1,V2,V3,Vna,Vdis,V0s)
datafr
## i name logic V1 V2 V3 Vna Vdis V0s
## 1 1 aa TRUE 1.3 2.3 2 1 1 1.0
## 2 2 bb FALSE NaN NA Inf -4 1 1.5
## 3 3 cc TRUE 4.5 5.0 3 2 2 2.0
## 4 4 dd FALSE 2.7 6.0 -4 NaN 2 2.0
## 5 5 ee FALSE 20.0 7.0 5 3 3 2.0
## 6 6 ff TRUE 4.4 8.0 6 -Inf 3 2.0
## 7 7 gg TRUE -2.1 4.0 7 NA 3 2.0
## 8 8 hh TRUE 1.1 -10.0 -2 6 1 3.0
## 9 9 ii FALSE -5.0 0.5 8 5 2 2.5
DDCdatafr = DDC(datafr,DDCpars)
##
## The input data has 9 rows and 9 columns.
##
## The input data contained 2 non-numeric columns (variables).
## Their column names are:
##
## [1] name logic
##
## These columns will be ignored in the analysis.
## We continue with the remaining 7 numeric columns:
##
## [1] i V1 V2 V3 Vna Vdis V0s
##
## The data contained 1 columns that were identical to the case number
## (number of the row).
## Their column names are:
##
## [1] i
##
## These columns will be ignored in the analysis.
## We continue with the remaining 6 columns:
##
## [1] V1 V2 V3 Vna Vdis V0s
##
## The data contained 1 discrete columns with 3 or fewer values.
## Their column names are:
##
## [1] Vdis
##
## These columns will be ignored in the analysis.
## We continue with the remaining 5 columns:
##
## [1] V1 V2 V3 Vna V0s
##
## The data contained 1 columns with zero or tiny median absolute deviation.
## Their column names are:
##
## [1] V0s
##
## These columns will be ignored in the analysis.
## We continue with the remaining 4 columns:
##
## [1] V1 V2 V3 Vna
##
## The final data set we will analyze has 9 rows and 4 columns.
##
remX = DDCdatafr$remX; dim(remX)
## [1] 9 4
cellMap(D=remX, R=DDCdatafr$stdResid, rowlabels = 1:nrow(remX),
columnlabels = colnames(remX))
# Red cells have higher value than predicted, blue cells lower,
# white cells are missing values, all other cells are yellow.
set.seed(12345) # for reproducibility
n <- 50; d <- 20
A <- matrix(0.9, d, d); diag(A) = 1
round(A[1:10,1:10],1) # true covariance matrix
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1.0 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9
## [2,] 0.9 1.0 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9
## [3,] 0.9 0.9 1.0 0.9 0.9 0.9 0.9 0.9 0.9 0.9
## [4,] 0.9 0.9 0.9 1.0 0.9 0.9 0.9 0.9 0.9 0.9
## [5,] 0.9 0.9 0.9 0.9 1.0 0.9 0.9 0.9 0.9 0.9
## [6,] 0.9 0.9 0.9 0.9 0.9 1.0 0.9 0.9 0.9 0.9
## [7,] 0.9 0.9 0.9 0.9 0.9 0.9 1.0 0.9 0.9 0.9
## [8,] 0.9 0.9 0.9 0.9 0.9 0.9 0.9 1.0 0.9 0.9
## [9,] 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 1.0 0.9
## [10,] 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 0.9 1.0
library(MASS) # only needed for the following line:
x <- mvrnorm(n, rep(0,d), A)
x[sample(1:(n * d), 50, FALSE)] <- NA
x[sample(1:(n * d), 50, FALSE)] <- 10
x[sample(1:(n * d), 50, FALSE)] <- -10
# When not specifying DDCpars in the call to DDC
# all defaults are used:
DDCx <- DDC(x)
##
## The input data has 50 rows and 20 columns.
cellMap(D=DDCx$remX, R=DDCx$stdResid, columnlabels = 1:d,
rowlabels = 1:n)
# Red cells have higher value than predicted, blue cells lower,
# white cells are missing values, all other cells are yellow.
DDCx$DDCpars # These are the default options:
## $fracNA
## [1] 0.5
##
## $numDiscrete
## [1] 3
##
## $precScale
## [1] 1e-12
##
## $cleanNAfirst
## [1] "automatic"
##
## $tolProb
## [1] 0.99
##
## $corrlim
## [1] 0.5
##
## $combinRule
## [1] "wmean"
##
## $returnBigXimp
## [1] FALSE
##
## $silent
## [1] FALSE
##
## $nLocScale
## [1] 25000
##
## $fastDDC
## [1] FALSE
##
## $standType
## [1] "1stepM"
##
## $corrType
## [1] "gkwls"
##
## $transFun
## [1] "wrap"
##
## $nbngbrs
## [1] 100
names(DDCx)
## [1] "DDCpars" "colInAnalysis" "rowInAnalysis"
## [4] "namesNotNumeric" "namesCaseNumber" "namesNAcol"
## [7] "namesNArow" "namesDiscrete" "namesZeroScale"
## [10] "remX" "locX" "scaleX"
## [13] "Z" "nbngbrs" "ngbrs"
## [16] "robcors" "robslopes" "deshrinkage"
## [19] "Xest" "scalestres" "stdResid"
## [22] "indcells" "Ti" "medTi"
## [25] "madTi" "indrows" "indall"
## [28] "indNAs" "Ximp"
# We will now go through these outputs one by one:
DDCx$colInAnalysis # all columns X1,...,X20 remain:
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## X19 X20
## 19 20
DDCx$rowInAnalysis # all rows 1,...,50 remain:
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
DDCx$namesNotNumeric # no non-numeric columns:
## NULL
DDCx$namesCaseNumber # no column was the case number:
## NULL
DDCx$namesNAcol # no columns with too many NA's:
## NULL
DDCx$namesNArow # no columns with too many NA's:
## NULL
DDCx$namesDiscrete # no discrete columns:
## NULL
DDCx$namesZeroScale # no columns with scale = 0:
## NULL
dim(DDCx$remX) # remaining data matrix
## [1] 50 20
round(DDCx$locX,2) # robust location estimates of all 20 columns:
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## -0.15 -0.23 -0.18 -0.31 -0.17 -0.25 -0.63 -0.33 -0.40 -0.23 -0.31 -0.32
## X13 X14 X15 X16 X17 X18 X19 X20
## -0.20 -0.28 -0.35 -0.23 -0.26 -0.23 -0.33 -0.37
round(DDCx$scaleX,2) # robust scale estimates of all 20 columns:
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
## 1.33 1.54 1.47 1.21 1.33 1.29 1.21 1.25 1.21 1.38 1.43 1.26 1.06 1.38 1.24
## X16 X17 X18 X19 X20
## 1.27 1.50 1.20 1.12 1.31
round(DDCx$Z[1:10,1:10],1)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] -0.5 -0.6 -0.3 -0.2 0.0 -0.1 -0.2 -0.6 0.2 0.2
## [2,] 0.0 -0.1 -0.5 NA -0.7 -0.1 -0.2 -7.7 -7.9 -0.2
## [3,] 0.2 0.1 0.3 0.5 0.2 7.9 0.4 NA 0.6 7.4
## [4,] 0.5 0.4 0.1 0.2 0.6 -7.5 0.7 0.6 0.7 7.4
## [5,] -0.6 -0.2 -0.2 -0.1 -0.2 -0.2 0.6 -0.5 -7.9 -0.6
## [6,] 1.6 1.3 1.2 1.7 1.2 1.6 2.3 1.3 1.6 1.7
## [7,] -0.2 -0.3 -6.7 -0.5 -0.7 -0.3 -0.1 -0.5 -7.9 0.0
## [8,] -7.4 0.2 0.7 0.3 0.6 0.5 NA -7.7 0.5 0.1
## [9,] NA 0.1 0.2 0.7 0.5 0.6 0.4 0.2 0.0 0.4
## [10,] 0.3 0.6 0.7 0.6 1.1 0.6 1.4 -7.7 1.7 7.4
# Robustly standardized dataset. Due to the high correlations,
# cells in the same row look similar (except for outlying cells).
DDCx$nbngbrs
## [1] 19
# For each column the code looked for up to 19 non-self neighbors (highly correlated columns).
# It goes through all of them, unless fastDDC is set to TRUE.
DDCx$ngbrs[1:3,]
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## [1,] 1 11 2 14 12 13 17 16 18 10 4 8 19
## [2,] 2 11 17 16 13 15 12 1 8 4 10 20 14
## [3,] 3 13 20 5 14 16 6 11 9 17 4 2 12
## [,14] [,15] [,16] [,17] [,18] [,19] [,20]
## [1,] 15 20 6 3 9 5 7
## [2,] 9 3 5 19 6 18 7
## [3,] 19 10 8 15 1 18 7
# Shows the neighbors, e.g. the nearest non-self neighbor of X1 is X11, then X2,...
round(DDCx$robcors[1:3,],2)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## [1,] 1 0.93 0.93 0.93 0.93 0.92 0.91 0.91 0.90 0.90 0.90 0.90 0.90
## [2,] 1 0.94 0.94 0.93 0.93 0.93 0.93 0.93 0.93 0.92 0.92 0.91 0.91
## [3,] 1 0.95 0.94 0.94 0.93 0.93 0.92 0.91 0.91 0.91 0.91 0.91 0.90
## [,14] [,15] [,16] [,17] [,18] [,19] [,20]
## [1,] 0.89 0.88 0.88 0.87 0.87 0.87 0.84
## [2,] 0.91 0.91 0.90 0.90 0.90 0.89 0.88
## [3,] 0.89 0.89 0.89 0.89 0.87 0.87 0.86
# Robust correlations with these neighbors. In each row the correlations
# are sorted by decreasing absolute value.
round(DDCx$robslopes[1:3,],2)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## [1,] 1 0.91 1.02 0.96 0.85 0.75 0.97 0.83 0.78 0.92 0.83 0.86 0.85
## [2,] 1 0.87 0.87 0.76 0.72 0.73 0.76 0.84 0.78 0.70 0.85 0.84 0.83
## [3,] 1 0.70 0.80 0.81 0.82 0.72 0.78 0.84 0.73 0.82 0.69 0.87 0.73
## [,14] [,15] [,16] [,17] [,18] [,19] [,20]
## [1,] 0.81 0.87 0.83 0.97 0.85 0.80 0.70
## [2,] 0.73 0.94 0.76 0.74 0.72 0.71 0.67
## [3,] 0.69 0.79 0.72 0.69 0.79 0.68 0.65
# For each column, the slope of each neighbor predicting it.
# For instance, X1 is predicted by its first neighbor with
# slope 0.91 and by its second neighbor with slope 1.02 .
round(DDCx$deshrinkage,2)
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
## 1.09 1.09 1.09 1.07 1.08 1.09 1.14 1.07 1.08 1.07 1.09 1.09 1.10 1.08 1.09
## X16 X17 X18 X19 X20
## 1.11 1.09 1.09 1.12 1.10
# For each column, the factor by which its prediction is multiplied.
round(DDCx$Xest[1:12,1:10],2) # the estimated cells of remX:
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] -0.45 -0.55 -0.47 -0.61 -0.47 -0.56 -0.95 -0.62 -0.68 -0.53
## [2,] -0.51 -0.63 -0.56 -0.70 -0.57 -0.64 -1.04 -0.70 -0.77 -0.61
## [3,] 0.24 0.19 0.20 0.09 0.24 0.17 -0.19 0.06 -0.01 0.17
## [4,] 0.53 0.48 0.46 0.38 0.53 0.46 0.10 0.34 0.26 0.45
## [5,] -0.46 -0.54 -0.47 -0.61 -0.48 -0.54 -0.91 -0.62 -0.68 -0.54
## [6,] 1.81 1.85 1.72 1.71 1.86 1.84 1.54 1.61 1.50 1.81
## [7,] -0.46 -0.56 -0.49 -0.63 -0.50 -0.58 -0.95 -0.65 -0.70 -0.55
## [8,] 0.35 0.30 0.32 0.21 0.37 0.30 -0.09 0.17 0.09 0.28
## [9,] 0.36 0.30 0.30 0.21 0.36 0.29 -0.08 0.16 0.08 0.29
## [10,] 0.93 0.94 0.90 0.82 0.99 0.93 0.60 0.77 0.70 0.90
## [11,] 0.23 0.17 0.16 0.07 0.21 0.14 -0.23 0.04 -0.04 0.15
## [12,] -1.54 -1.71 -1.56 -1.75 -1.64 -1.73 -2.15 -1.70 -1.74 -1.67
round(DDCx$stdResid[1:12,1:10],1)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] -0.8 -1.7 -0.3 0.2 0.6 0.6 0.2 -1.1 1.6 1.2
## [2,] 0.8 0.6 -0.7 NA -1.3 0.7 0.4 -24.4 -28.7 0.2
## [3,] -0.3 -0.5 0.2 0.6 -0.3 26.7 0.2 NA 1.2 21.5
## [4,] -0.1 -0.3 -1.1 -1.3 0.4 -28.4 0.2 0.3 0.7 20.9
## [5,] -1.2 0.1 -0.2 0.5 0.1 0.1 2.6 -0.7 -29.0 -1.1
## [6,] 0.3 -0.4 -0.3 0.0 -1.0 0.0 1.5 -0.8 0.1 0.6
## [7,] 0.2 -0.3 -21.0 -0.9 -1.6 -0.2 0.5 -0.8 -28.9 0.7
## [8,] -23.9 -0.4 1.0 -0.4 0.6 0.3 NA -26.7 0.5 -0.9
## [9,] NA -0.8 -0.4 1.1 0.2 0.7 -0.1 -0.7 -1.4 0.0
## [10,] -1.5 -0.4 -0.1 -1.3 0.9 -1.0 1.1 -28.3 2.9 19.9
## [11,] 0.6 0.4 -1.2 0.0 -0.6 -27.5 0.6 1.1 -0.4 21.6
## [12,] -0.1 -0.3 -1.2 -0.1 -1.2 -1.0 1.0 2.1 1.1 0.3
# The standardized residuals of the cells. Note the NA's and some
# large positive and negative cell residuals.
qqnorm(as.vector(DDCx$stdResid)) # Note the far outliers on both sides:
as.vector(DDCx$indcells) # indices of the cells that were flagged by DDC:
## [1] 8 25 28 32 47 64 74 75 80 85 89 90 91 107
## [15] 113 116 121 129 137 141 177 179 184 187 191 197 215 228
## [29] 229 233 238 253 254 261 269 282 290 293 300 305 316 324
## [43] 352 358 360 377 384 402 405 407 410 453 454 460 461 478
## [57] 501 504 505 513 575 578 583 597 604 629 639 655 660 676
## [71] 677 697 740 750 757 775 776 781 789 791 800 801 813 814
## [85] 827 838 846 855 856 902 910 938 954 972 975 976 985 989
## [99] 994 1000
plot(DDCx$Ti) # the Ti values of the rows. None are high.
qqnorm(DDCx$Ti) # no outliers
DDCx$medTi # median of the raw Ti (used to standardize Ti):
## [1] -0.000131909
DDCx$madTi # median absolute deviation of the raw Ti (used to standardize Ti):
## [1] 0.08437571
DDCx$indrows # numeric(0) means no rows are flagged:
## numeric(0)
as.vector(DDCx$indall) # indices of the flagged cells, including those in flagged rows:
## [1] 8 25 28 32 47 64 74 75 80 85 89 90 91 107
## [15] 113 116 121 129 137 141 177 179 184 187 191 197 215 228
## [29] 229 233 238 253 254 261 269 282 290 293 300 305 316 324
## [43] 352 358 360 377 384 402 405 407 410 453 454 460 461 478
## [57] 501 504 505 513 575 578 583 597 604 629 639 655 660 676
## [71] 677 697 740 750 757 775 776 781 789 791 800 801 813 814
## [85] 827 838 846 855 856 902 910 938 954 972 975 976 985 989
## [99] 994 1000
as.vector(DDCx$indNAs) # indices of the missing cells:
## [1] 9 29 39 40 41 83 99 148 150 152 165 196 217 296 308 349 353
## [18] 369 376 383 444 466 471 531 587 589 609 622 678 680 684 737 743 749
## [35] 793 805 821 866 880 890 905 924 958 963 980
round(DDCx$Ximp[1:10,1:10],2)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] -0.81 -1.21 -0.59 -0.56 -0.23 -0.33 -0.86 -1.05 -0.15 0.01
## [2,] -0.15 -0.38 -0.87 -0.70 -1.07 -0.37 -0.89 -0.70 -0.77 -0.53
## [3,] 0.12 -0.01 0.31 0.28 0.13 0.17 -0.12 0.06 0.37 0.17
## [4,] 0.48 0.36 -0.02 -0.08 0.68 0.46 0.18 0.45 0.50 0.45
## [5,] -0.97 -0.50 -0.55 -0.45 -0.45 -0.51 -0.91 -0.90 -0.68 -1.05
## [6,] 1.94 1.71 1.59 1.70 1.47 1.85 2.15 1.32 1.54 2.08
## [7,] -0.39 -0.67 -0.49 -0.94 -1.11 -0.64 -0.77 -0.94 -0.70 -0.25
## [8,] 0.35 0.15 0.78 0.08 0.60 0.39 -0.09 0.17 0.25 -0.13
## [9,] 0.36 -0.02 0.10 0.57 0.44 0.55 -0.14 -0.09 -0.36 0.26
## [10,] 0.28 0.77 0.87 0.38 1.32 0.55 1.06 0.77 0.70 0.90
# The imputed matrix. Both the cellwise outliers and the missing values
# are replaced by their predicted values.
round((DDCx$Ximp - DDCx$remX)[1:10,1:10],2)
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## 1 0.00 0 0.00 0 0 0.00 0.00 0.00 0.00 0.00
## 2 0.00 0 0.00 NA 0 0.00 0.00 9.30 9.23 0.00
## 3 0.00 0 0.00 0 0 -9.83 0.00 NA 0.00 -9.83
## 4 0.00 0 0.00 0 0 10.46 0.00 0.00 0.00 -9.55
## 5 0.00 0 0.00 0 0 0.00 -1.05 0.00 9.32 0.00
## 6 0.00 0 0.00 0 0 0.00 0.00 0.00 0.00 0.00
## 7 0.00 0 9.51 0 0 0.00 0.00 0.00 9.30 0.00
## 8 10.35 0 0.00 0 0 0.00 NA 10.17 0.00 0.00
## 9 NA 0 0.00 0 0 0.00 0.00 0.00 0.00 0.00
## 10 0.00 0 0.00 0 0 0.00 0.00 10.77 -0.93 -9.10
# The nonzero values and the NA's correspond to imputed cells.
The Top Gear data contains information on 297 cars.
library(robustHD)
data(TopGear)
dim(TopGear)
## [1] 297 32
rownames(TopGear)[1:13] # "1" to "297" are not useful names
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13"
rownames(TopGear) = paste(TopGear[,1],TopGear[,2])
# Now the rownames are make and model of the cars.
rownames(TopGear)[165] = "Mercedes-Benz G" # name was too long
myTopGear = TopGear[,-31] # removes the subjective variable `Verdict'
# Transform some variables to get roughly gaussianity in the center:
transTG = myTopGear
transTG$Price = log(myTopGear$Price)
transTG$Displacement = log(myTopGear$Displacement)
transTG$BHP = log(myTopGear$BHP)
transTG$Torque = log(myTopGear$Torque)
transTG$TopSpeed = log(myTopGear$TopSpeed)
# Run the DDC method:
DDCpars = list(fastDDC = FALSE, silent = TRUE)
DDCtransTG = DDC(transTG,DDCpars)
##
## The final data set we will analyze has 296 rows and 11 columns.
##
# With DDCpars = list(fastDDC = FALSE, silent = FALSE) we obtain more information:
#
# The input data has 297 rows and 31 columns.
#
# The input data contained 19 non-numeric columns (variables).
# Their column names are:
#
# [1] Maker Model Type Fuel
# [5] DriveWheel AdaptiveHeadlights AdjustableSteering AlarmSystem
# [9] Automatic Bluetooth ClimateControl CruiseControl
# [13] ElectricSeats Leather ParkingSensors PowerSteering
# [17] SatNav ESP Origin
#
# These columns will be ignored in the analysis.
# We continue with the remaining 12 numeric columns:
#
# [1] Price Cylinders Displacement BHP Torque Acceleration TopSpeed
# [8] MPG Weight Length Width Height
#
# The data contained 1 rows with over 50% of NAs.
# Their row names are:
#
# [1] Citroen C5 Tourer
#
# These rows will be ignored in the analysis.
# We continue with the remaining 296 rows:
#
# [1] Alfa Romeo Giulietta Alfa Romeo MiTo
# .......
# [295] Volvo XC70 Volvo XC90
#
# The data contained 1 columns with zero or tiny median absolute deviation.
# Their column names are:
#
# [1] Cylinders
#
# These columns will be ignored in the analysis.
# We continue with the remaining 11 columns:
#
# [1] Price Displacement BHP Torque Acceleration TopSpeed MPG
# [8] Weight Length Width Height
#
# The final data set we will analyze has 296 rows and 11 columns.
remX = DDCtransTG$remX # the remaining part of the dataset
dim(remX)
## [1] 296 11
colSums(is.na(remX)) # There are still NAs, mainly in `Weight':
## Price Displacement BHP Torque Acceleration
## 0 8 3 3 0
## TopSpeed MPG Weight Length Width
## 3 11 32 10 15
## Height
## 10
# Analyze the data by column:
standX = scale(remX,apply(remX,2,median,na.rm = TRUE),
apply(remX,2,mad,na.rm = TRUE))
dim(standX)
## [1] 296 11
round(standX[1:5,],1) # has NAs where remX does
## Price Displacement BHP Torque Acceleration
## Alfa Romeo Giulietta -0.4 -0.4 -0.7 0.0 0.6
## Alfa Romeo MiTo -0.9 -0.7 -0.7 -1.6 0.4
## Aston Martin Cygnet 0.3 -0.8 -0.8 -1.7 0.7
## Aston Martin DB9 2.7 2.1 1.9 1.2 -1.2
## Aston Martin DB9 Volante 2.8 2.1 1.9 1.2 -1.2
## TopSpeed MPG Weight Length Width Height
## Alfa Romeo Giulietta -0.5 1.0 -0.3 -0.3 -0.2 -0.2
## Alfa Romeo MiTo -0.4 0.1 -1.0 -0.9 -1.1 -0.3
## Aston Martin Cygnet -0.9 0.6 -1.3 -3.1 -1.5 0.1
## Aston Martin DB9 2.0 -1.7 0.8 0.6 NA -1.6
## Aston Martin DB9 Volante 2.0 -1.7 1.0 0.6 NA -1.6
transTGcol = remX
transTGcol[abs(standX) > sqrt(qchisq(0.99,1))] = NA
round(transTGcol[1:5,],1) # has NAs in outlying cells as well:
## Price Displacement BHP Torque Acceleration
## Alfa Romeo Giulietta 10.0 7.4 4.7 5.5 11.3
## Alfa Romeo MiTo 9.6 7.2 4.7 4.6 10.7
## Aston Martin Cygnet 10.3 7.2 4.6 4.5 11.8
## Aston Martin DB9 NA 8.7 6.2 6.1 4.6
## Aston Martin DB9 Volante NA 8.7 6.2 6.1 4.6
## TopSpeed MPG Weight Length Width Height
## Alfa Romeo Giulietta 4.7 64 1385 4351 1798 1465
## Alfa Romeo MiTo 4.8 49 1090 4063 1720 1446
## Aston Martin Cygnet 4.7 56 988 NA 1680 1500
## Aston Martin DB9 5.2 19 1785 4720 NA 1282
## Aston Martin DB9 Volante 5.2 19 1890 4720 NA 1282
# Make untransformed submatrix of X for labeling the cells in the plot:
tempX = myTopGear[DDCtransTG$rowInAnalysis,DDCtransTG$colInAnalysis]
tempX$Price = tempX$Price/1000 # to avoid printing long numbers
dim(tempX)
## [1] 296 11
columnlabels = colnames(tempX)
rowlabels = rownames(tempX)
# Show the following 17 cars in the cellmap:
showrows = c(12,42,56,73,81,94,99,135,150,164,176,198,209,215,234,241,277)
# Make two ggplot2 objects:
ggpcol = cellMap(D=tempX,
R=standX,
indcells=which(is.na(transTGcol)),
indrows=integer(0),
columnlabels=columnlabels,
rowlabels=rowlabels,
mTitle="By column",
showrows=showrows,
showVals="D",
adjustrowlabels=0.5)
plot(ggpcol)
ggpDDC = cellMap(D=tempX,
R=DDCtransTG$stdResid,
indcells=DDCtransTG$indcells,
indrows=DDCtransTG$indrows,
columnlabels=columnlabels,
rowlabels=rowlabels,
mTitle="DetectDeviatingCells",
showrows=showrows,
showVals="D",
adjustrowlabels=0.5)
plot(ggpDDC)
# Creating the pdf:
# pdf("cellMap_TopGear.pdf", width = 20, height = 15 )
# gridExtra::grid.arrange(ggpcol,ggpDDC,nrow=1) # combines 2 plots in a figure
# dev.off()
We now consider the 17 cars shown in the cellmap as a new' dataset.
# Top Gear dataset: prediction of "new" data
############################################
# For comparison we first remake the cell map of the entire dataset, but now
# showing the values of the residuals instead of the data values:
dim(remX) # 296 11
## [1] 296 11
rowlabels = rownames(remX)
columnlabels = colnames(remX)
ggpDDC = cellMap(D=remX,
R=DDCtransTG$stdResid,
indcells=DDCtransTG$indcells,
indrows=DDCtransTG$indrows,
standOD=NULL,
showVals="R",
rowlabels=rowlabels,
columnlabels=columnlabels,
mTitle="DDC",
showrows=showrows,
adjustrowlabels=0.5,
outlyingGrad = 1)
plot(ggpDDC)
initX = remX[-showrows,]
dim(initX) # 279 11
## [1] 279 11
# Fit initX:
DDCinitX = DDC(initX,DDCpars=DDCpars)
##
## The final data set we will analyze has 278 rows and 11 columns.
##
newX = remX[showrows,]
dim(newX) # 17 11
## [1] 17 11
# Make predictions by DDCpredict.
# Its inputs are:
# Xnew : the new data (test data)
# InitialDDC : Must be provided.
# DDCpars : the input options to be used for the prediction.
# By default the options of InitialDDC are used.
predictDDC = DDCpredict(newX,DDCinitX)
names(DDCinitX)
## [1] "DDCpars" "colInAnalysis" "rowInAnalysis"
## [4] "namesNotNumeric" "namesCaseNumber" "namesNAcol"
## [7] "namesNArow" "namesDiscrete" "namesZeroScale"
## [10] "remX" "locX" "scaleX"
## [13] "Z" "nbngbrs" "ngbrs"
## [16] "robcors" "robslopes" "deshrinkage"
## [19] "Xest" "scalestres" "stdResid"
## [22] "indcells" "Ti" "medTi"
## [25] "madTi" "indrows" "indall"
## [28] "indNAs" "Ximp"
# For comparison with:
names(predictDDC) # Fewer, since DDCpredict does not call checkDataSet:
## [1] "DDCpars" "locX" "scaleX" "Z" "nbngbrs"
## [6] "ngbrs" "robcors" "robslopes" "deshrinkage" "Xest"
## [11] "scalestres" "stdResid" "indcells" "Ti" "medTi"
## [16] "madTi" "indrows" "indNAs" "indall" "Ximp"
# If you specify the parameters the result is the same:
predictDDC2 = DDCpredict(newX,DDCinitX,DDCpars=DDCpars)
all.equal(predictDDC,predictDDC2) # TRUE
## [1] TRUE
ggpnew = cellMap(D=newX,
R=predictDDC$stdResid,
indcells=predictDDC$indcells,
indrows=predictDDC$indrows,
standOD=NULL,
showVals="R",
rowlabels=rowlabels[showrows],
columnlabels=columnlabels,
mTitle="DDCpredict",
adjustrowlabels=0.5,
outlyingGrad = 1)
plot(ggpnew) # Looks quite similar to the result using the entire dataset:
# Creating the pdf:
# pdf("TopGear_DDCpredict.pdf",width=20,height=15)
# gridExtra::grid.arrange(ggpDDC,ggpnew,nrow=1)
# dev.off()
The philips data contains 9 measurements of TV parts from a production line.
data(philips)
dim(philips)
## [1] 677 9
colnames(philips) = c("X1","X2","X3","X4","X5","X6","X7","X8","X9")
DDCphilips = DDC(philips,DDCpars)
qqnorm(as.vector(DDCphilips$Z)) # rather gaussian, here we only see 2 outliers:
round(DDCphilips$stdResid[1:12,],1) # the standardized residuals:
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
## [1,] -0.9 2.2 0.0 1.2 0.8 -2.9 2.1 0.1 -1.4
## [2,] -0.5 2.0 0.0 3.9 0.8 -1.1 3.5 -0.4 -1.7
## [3,] -1.0 2.3 -0.4 0.7 0.6 -2.0 1.4 0.0 -1.7
## [4,] -0.6 2.3 -0.1 3.8 1.1 -0.9 3.4 -0.5 -2.0
## [5,] -0.6 2.8 -0.2 3.4 0.9 -0.9 3.4 0.2 -1.9
## [6,] -0.7 2.6 0.0 4.4 1.0 -1.1 4.2 -0.1 -2.0
## [7,] 1.0 -0.1 -1.0 -0.3 -1.2 -1.0 1.4 0.8 -0.6
## [8,] -0.4 -1.4 -0.7 1.8 -0.7 -2.8 1.9 -0.3 -1.7
## [9,] 1.0 -0.2 0.0 -0.1 -1.2 -2.1 1.6 1.0 -0.9
## [10,] 0.2 -1.3 -0.1 1.0 -0.5 -2.9 2.0 -0.1 -1.5
## [11,] -0.1 -0.3 -0.1 0.2 -1.1 -1.5 1.9 0.8 -0.8
## [12,] -0.1 -0.4 -1.5 2.5 0.0 -3.4 2.3 0.3 -2.5
DDCphilips$indcells # indices of the cells that were flagged:
## [1] 55 70 95 104 116 144 151 156 161 170 174 175 324 468
## [15] 682 683 712 747 750 792 985 986 987 988 989 991 992 1006
## [29] 1007 1008 1010 1011 1196 1199 1450 1452 1455 1474 1529 1787 2033 2035
## [43] 2036 2037 2045 2047 2094 2101 2114 2116 2126 2135 2267 2312 2355 2365
## [57] 2464 2499 2592 2743 2775 2777 3120 3216 3226 3227 3228 3248 3386 3393
## [71] 3395 3397 3399 3401 3412 3413 3414 3415 3416 3417 3423 3424 3425 3426
## [85] 3427 3428 3430 3431 3437 3445 3456 3457 3460 3464 3465 3466 3468 3469
## [99] 3470 3990 4064 4066 4067 4068 4076 4078 4088 4145 4147 4792 4794 5036
## [113] 5037 5168 5169 5170 5171 5172 5174 5175 5176 5230 5238 5241 5246 5248
## [127] 5250 5253 5256 5258 5259 5260 5262 5263 5265 5267 5268 5270 5510 5511
## [141] 5512 5513 5514 5515 5516 5517 5518 5519 5520 5569 5869 5873 5882 5890
## [155] 5902
DDCphilips$indrows # flagged rows:
## [1] 26 56 84 334 491
library(robustbase) # for covMcd
MCDphilips = robustbase::covMcd(philips)
indrowsMCD = which(mahalanobis(philips,MCDphilips$center,
MCDphilips$cov) > qchisq(0.975,df=d))
plot(sqrt(mahalanobis(philips,MCDphilips$center,MCDphilips$cov)),
main="Philips data",ylab="Robust distances",xlab="",pch=20)
abline(h=sqrt(qchisq(0.975,df=9))) # this horizontal line is the cutoff.
# dev.copy(pdf,"Figure_philips_left.pdf",width=10,height=4)
# dev.off()
# cellMaps with rectangular blocks:
n = nrow(philips)
nrowsinblock = 15
rowlabels = 1:n
d = ncol(philips)
ncolumnsinblock = 1
columnlabels = colnames(philips)
ggpMCDphilips = cellMap(D=philips,
R=matrix(0,n,d),
indcells=integer(0),
indrows=indrowsMCD,
rowlabels=rowlabels,
columnlabels=columnlabels,
mTitle="MCD",
nrowsinblock=nrowsinblock,
ncolumnsinblock=ncolumnsinblock,
autolabel=T)
plot(ggpMCDphilips)
ggpDDCphilips = cellMap(D=philips,
R=DDCphilips$stdResid,
indcells=DDCphilips$indcells,
indrows=DDCphilips$indrows,
rowlabels=rowlabels,
columnlabels=columnlabels,
mTitle="DetectDeviatingCells",
nrowsinblock=nrowsinblock,
ncolumnsinblock=ncolumnsinblock,
autolabel=T,
adjustrowlabels=1)
plot(ggpDDCphilips)
# dev.copy(pdf,"Figure_philips_right.pdf",width=6,height=12)
# dev.off()
This dataset contains the mortality of French males in the years 1816 to 2013.
data(mortality)
dim(mortality)
## [1] 198 91
# 198 91
rownames(mortality)[1:5]
## [1] "1816" "1817" "1818" "1819" "1820"
colnames(mortality)[1:5]
## [1] "0" "1" "2" "3" "4"
DDCmortality = DDC(mortality,DDCpars) # 1 second
remX = DDCmortality$remX
dim(remX)
## [1] 198 91
library(rrcov) # contains ROBPCA
PCAmortality = rrcov::PcaHubert(mortality,alpha=0.75,scale=FALSE)
n = nrow(remX)
nrowsinblock = 5
rowlabels = rownames(remX)
d = ncol(remX)
ncolumnsinblock = 5
columnlabels = colnames(remX)
ggpROBPCA = cellMap(D=remX,
R=matrix(0,n,d),
indrows=which(PCAmortality@flag==FALSE),
rowlabels=rowlabels,
columnlabels=columnlabels,
mTitle="By row",
nrowsinblock=nrowsinblock,
ncolumnsinblock=ncolumnsinblock,
rowtitle = "Years",
columntitle = "Age",
sizetitles = 2.0,
autolabel=T)
plot(ggpROBPCA)
ggpDDC = cellMap(D=remX,
R=DDCmortality$stdResid,
indcells=DDCmortality$indcells,
indrows=DDCmortality$indrows,
rowlabels=rowlabels,
columnlabels=columnlabels,
mTitle="DetectDeviatingCells",
nrowsinblock=nrowsinblock,
ncolumnsinblock=ncolumnsinblock,
rowtitle = "Years",
columntitle = "Age",
sizetitles = 2.0,
autolabel=T)
plot(ggpDDC) # Leads to a detailed interpretation:
# pdf("cellmap_mortality.pdf",width=14,height=12)
# gridExtra::grid.arrange(ggpROBPCA,ggpDDC,nrow=1)
# dev.off()
The glass data consists of spectra with 750 wavelengths of 180 archaeological glass samples.
data(glass)
DDCglass = DDC(glass,DDCpars) # takes 8 seconds
##
## The final data set we will analyze has 180 rows and 737 columns.
##
remX = DDCglass$remX
# With DDCpars$silent = FALSE we obtain more information:
#
# The input data has 180 rows and 750 columns.
#
# The data contained 11 discrete columns with 3 or fewer values.
# Their column names are:
#
# [1] V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
#
# These columns will be ignored in the analysis.
# We continue with the remaining 739 columns:
#
# [1] V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25
# ......
# [729] V740 V741 V742 V743 V744 V745 V746 V747 V748 V749 V750
#
# The data contained 2 columns with zero or tiny median absolute deviation.
# Their column names are:
#
# [1] V12 V13
#
# These columns will be ignored in the analysis.
# We continue with the remaining 737 columns:
#
# [1] V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27
# ......
# [729] V742 V743 V744 V745 V746 V747 V748 V749 V750
#
# The final data set we will analyze has 180 rows and 737 columns.
dim(remX)
## [1] 180 737
fastDDCpars = list(fastDDC = TRUE, silent = TRUE)
fastDDCglass = DDC(glass, fastDDCpars) # takes 2 seconds
##
## The final data set we will analyze has 180 rows and 737 columns.
##
remXfast = fastDDCglass$remX
all.equal(remX,remXfast) # The remaining data is the same:
## [1] TRUE
library(rrcov) # contains ROBPCA
PCAglass = rrcov::PcaHubert(remX,alpha=0.75,scale=FALSE)
n = nrow(remX)
nrowsinblock = 5
rowtitle = "glass samples"
rowlabels = rep("",floor(n/nrowsinblock));
rowlabels[1] = "1"
rowlabels[floor(n/ncolumnsinblock)] = "n";
# rowlabels
d = ncol(remX)
ncolumnsinblock = 5
columntitle = "wavelengths"
columnlabels = rep("",floor(d/ncolumnsinblock));
columnlabels[1] = "1";
columnlabels[floor(d/nrowsinblock)] = "d"
# columnlabels
ggpROBPCA = cellMap(D=remX,
R=matrix(0,n,d),
indcells=integer(0),
indrows=which(PCAglass@flag==FALSE),
rowlabels=rowlabels,
columnlabels=columnlabels,
mTitle="By row",
nrowsinblock=nrowsinblock,
ncolumnsinblock=ncolumnsinblock,
columnangle=0,
rowtitle=rowtitle,
columntitle=columntitle,
sizetitles=1.5,
autolabel=F)
plot(ggpROBPCA)
ggpDDC = cellMap(D=remX,
R=DDCglass$stdResid,
indcells=DDCglass$indcells,
indrows=DDCglass$indrows,
rowlabels=rowlabels,
columnlabels=columnlabels,
mTitle="DDC",
nrowsinblock=nrowsinblock,
ncolumnsinblock=ncolumnsinblock,
columnangle=0,
rowtitle=rowtitle,
columntitle=columntitle,
sizetitles=1.5,
autolabel=F)
plot(ggpDDC)
# pdf("cellmap_glass_ROBPCA_DDC.pdf",width=16,height=10)
# gridExtra::grid.arrange(ggpROBPCA,ggpDDC,ncol=1)
# dev.off()
ggpfastDDC = cellMap(D=remXfast,
R=fastDDCglass$stdResid,
indcells=fastDDCglass$indcells,
indrows=fastDDCglass$indrows,
rowlabels=rowlabels,
columnlabels=columnlabels,
mTitle="fast DDC",
nrowsinblock=nrowsinblock,
ncolumnsinblock=ncolumnsinblock,
columnangle=0,
rowtitle=rowtitle,
columntitle=columntitle,
sizetitles=1.5,
autolabel=F)
plot(ggpfastDDC)
# pdf("cellmap_glass_DDC_fastDDC.pdf",width=16,height=10)
# gridExtra::grid.arrange(ggpDDC,ggpfastDDC,ncol=1)
# dev.off()