The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

Introduction

This file contains some examples of the functions related to the DDC routine. More specifically, DDC and cellMap will be illustrated.

library("cellWise")
library("gridExtra") # has grid.arrange()

# Default options for DDC:
DDCpars = list(fracNA = 0.5, numDiscrete = 3, precScale = 1e-12,
               cleanNAfirst = "automatic", tolProb = 0.99, 
               corrlim = 0.5, combinRule = "wmean",
               returnBigXimp = FALSE, silent = FALSE,
               nLocScale = 25000, fastDDC = FALSE,
               standType = "1stepM", corrType = "gkwls",
               transFun = "wrap", nbngbrs = 100)

# A small list giving the same results:
DDCpars = list(fastDDC = FALSE)

Example with row and column selection

First we illustrate the selection of columns and rows by DDC. This is actually done by the function checkDataSet which is called by DDC.

i = c(1,2,3,4,5,6,7,8,9) 
name = c("aa","bb","cc","dd","ee","ff","gg","hh","ii") 
logic = c(TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE) 
V1 = c(1.3,NaN,4.5,2.7,20.0,4.4,-2.1,1.1,-5)
V2 = c(2.3,NA,5,6,7,8,4,-10,0.5)
V3 = c(2,Inf,3,-4,5,6,7,-2,8)
Vna = c(1,-4,2,NaN,3,-Inf,NA,6,5)
Vdis = c(1,1,2,2,3,3,3,1,2)
V0s = c(1,1.5,2,2,2,2,2,3,2.5) 
datafr = data.frame(i,name,logic,V1,V2,V3,Vna,Vdis,V0s) 
datafr

##   i name logic   V1    V2  V3  Vna Vdis V0s
## 1 1   aa  TRUE  1.3   2.3   2    1    1 1.0
## 2 2   bb FALSE  NaN    NA Inf   -4    1 1.5
## 3 3   cc  TRUE  4.5   5.0   3    2    2 2.0
## 4 4   dd FALSE  2.7   6.0  -4  NaN    2 2.0
## 5 5   ee FALSE 20.0   7.0   5    3    3 2.0
## 6 6   ff  TRUE  4.4   8.0   6 -Inf    3 2.0
## 7 7   gg  TRUE -2.1   4.0   7   NA    3 2.0
## 8 8   hh  TRUE  1.1 -10.0  -2    6    1 3.0
## 9 9   ii FALSE -5.0   0.5   8    5    2 2.5

DDCdatafr = DDC(datafr,DDCpars)

##  
##  The input data has 9 rows and 9 columns.
##  
##  The input data contained 2 non-numeric columns (variables).
##  Their column names are:
##  
## [1] name  logic
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 7 numeric columns:
##  
## [1] i    V1   V2   V3   Vna  Vdis V0s 
##  
##  The data contained 1 columns that were identical to the case number
##  (number of the row).
##  Their column names are:
##  
## [1] i
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 6 columns:
##  
## [1] V1   V2   V3   Vna  Vdis V0s 
##  
##  The data contained 1 discrete columns with 3 or fewer values.
##  Their column names are:
##  
## [1] Vdis
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 5 columns:
##  
## [1] V1  V2  V3  Vna V0s
##  
##  The data contained 1 columns with zero or tiny median absolute deviation.
##  Their column names are:
##  
## [1] V0s
##  
##  These columns will be ignored in the analysis.
##  We continue with the remaining 4 columns:
##  
## [1] V1  V2  V3  Vna
##  
##  The final data set we will analyze has 9 rows and 4 columns.
##

remX = DDCdatafr$remX; dim(remX)

## [1] 9 4

cellMap(DDCdatafr$stdResid)

plot of chunk unnamed-chunk-3

# Red cells have higher value than predicted, blue cells lower,
# white cells are missing values, all other cells are yellow.

Small generated dataset

set.seed(12345) # for reproducibility
n <- 50; d <- 20
A <- matrix(0.9, d, d); diag(A) = 1
round(A[1:10,1:10],1) # true covariance matrix

##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
##  [1,]  1.0  0.9  0.9  0.9  0.9  0.9  0.9  0.9  0.9   0.9
##  [2,]  0.9  1.0  0.9  0.9  0.9  0.9  0.9  0.9  0.9   0.9
##  [3,]  0.9  0.9  1.0  0.9  0.9  0.9  0.9  0.9  0.9   0.9
##  [4,]  0.9  0.9  0.9  1.0  0.9  0.9  0.9  0.9  0.9   0.9
##  [5,]  0.9  0.9  0.9  0.9  1.0  0.9  0.9  0.9  0.9   0.9
##  [6,]  0.9  0.9  0.9  0.9  0.9  1.0  0.9  0.9  0.9   0.9
##  [7,]  0.9  0.9  0.9  0.9  0.9  0.9  1.0  0.9  0.9   0.9
##  [8,]  0.9  0.9  0.9  0.9  0.9  0.9  0.9  1.0  0.9   0.9
##  [9,]  0.9  0.9  0.9  0.9  0.9  0.9  0.9  0.9  1.0   0.9
## [10,]  0.9  0.9  0.9  0.9  0.9  0.9  0.9  0.9  0.9   1.0

library(MASS) # only needed for the following line:
x <- mvrnorm(n, rep(0,d), A)
x[sample(1:(n * d), 50, FALSE)] <- NA
x[sample(1:(n * d), 50, FALSE)] <- 10
x[sample(1:(n * d), 50, FALSE)] <- -10

# When not specifying DDCpars in the call to DDC
# all defaults are used:
DDCx <- DDC(x)

##  
##  The input data has 50 rows and 20 columns.

cellMap(DDCx$stdResid)

plot of chunk unnamed-chunk-4

# Red cells have higher value than predicted, blue cells lower,
# white cells are missing values, all other cells are yellow.

DDCx$DDCpars # These are the default options:

## $fracNA
## [1] 0.5
## 
## $numDiscrete
## [1] 3
## 
## $precScale
## [1] 1e-12
## 
## $cleanNAfirst
## [1] "automatic"
## 
## $tolProb
## [1] 0.99
## 
## $corrlim
## [1] 0.5
## 
## $combinRule
## [1] "wmean"
## 
## $returnBigXimp
## [1] FALSE
## 
## $silent
## [1] FALSE
## 
## $nLocScale
## [1] 25000
## 
## $fastDDC
## [1] FALSE
## 
## $standType
## [1] "1stepM"
## 
## $corrType
## [1] "gkwls"
## 
## $transFun
## [1] "wrap"
## 
## $nbngbrs
## [1] 100

names(DDCx)

##  [1] "DDCpars"         "colInAnalysis"   "rowInAnalysis"   "namesNotNumeric"
##  [5] "namesCaseNumber" "namesNAcol"      "namesNArow"      "namesDiscrete"  
##  [9] "namesZeroScale"  "remX"            "locX"            "scaleX"         
## [13] "Z"               "nbngbrs"         "ngbrs"           "robcors"        
## [17] "robslopes"       "deshrinkage"     "Xest"            "scalestres"     
## [21] "stdResid"        "indcells"        "Ti"              "medTi"          
## [25] "madTi"           "indrows"         "indall"          "indNAs"         
## [29] "Ximp"

# We will now go through these outputs one by one:

DDCx$colInAnalysis # all columns X1,...,X20 remain:

##  X1  X2  X3  X4  X5  X6  X7  X8  X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20

DDCx$rowInAnalysis # all rows 1,...,50 remain:

##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 
## 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50

DDCx$namesNotNumeric # no non-numeric columns:

## NULL

DDCx$namesCaseNumber # no column was the case number:

## NULL

DDCx$namesNAcol # no columns with too many NA's:

## NULL

DDCx$namesNArow # no columns with too many NA's:

## NULL

DDCx$namesDiscrete # no discrete columns:

## NULL

DDCx$namesZeroScale # no columns with scale = 0:

## NULL

dim(DDCx$remX) # remaining data matrix

## [1] 50 20

round(DDCx$locX,2) # robust location estimates of all 20 columns:

##    X1    X2    X3    X4    X5    X6    X7    X8    X9   X10   X11   X12   X13 
## -0.22 -0.33 -0.37 -0.23 -0.31 -0.42 -0.40 -0.25 -0.24 -0.17 -0.34 -0.31 -0.42 
##   X14   X15   X16   X17   X18   X19   X20 
## -0.33 -0.26 -0.23 -0.40 -0.30 -0.35 -0.30

round(DDCx$scaleX,2) # robust scale estimates of all 20 columns:

##   X1   X2   X3   X4   X5   X6   X7   X8   X9  X10  X11  X12  X13  X14  X15  X16 
## 1.32 1.36 1.12 1.15 1.23 1.35 1.07 1.45 1.04 0.96 1.47 1.37 1.22 1.40 1.40 1.42 
##  X17  X18  X19  X20 
## 1.47 1.31 1.44 1.35

round(DDCx$Z[1:10,1:10],1)

##      X1   X2   X3   X4   X5   X6   X7   X8   X9  X10
## 1  -0.3  0.1  0.1 -0.4  0.1 -0.4  0.2 -0.2 -0.1 -0.3
## 2   0.0 -0.1 -0.3 -0.4 -0.6 -7.1 -0.1 -0.8 -0.6 -0.3
## 3   0.1  0.3  0.1  0.6  0.7  0.4  0.5  0.4   NA  0.4
## 4   0.4  0.6  0.6   NA   NA  0.7  0.8 -6.7  0.8  0.2
## 5   0.2 -0.2   NA  0.0 -0.1 -0.1 -0.6  0.0  0.1 -0.4
## 6   1.6  1.6  2.0  1.8  1.5  1.6  1.6  1.4  2.2  1.8
## 7    NA -0.1 -0.1 -0.5 -0.2  0.1 -0.2  7.1  0.0 -0.5
## 8   0.5  0.6  0.3  0.6  0.4  0.0 -9.0  0.7  0.6  0.2
## 9   0.3  0.8  0.6  0.4  0.2  0.4  0.3  0.1  0.4  1.0
## 10  0.8  0.6  1.2  1.5  1.3  1.2   NA  0.6  1.1  0.6

# Robustly standardized dataset. Due to the high correlations,
# cells in the same row look similar (except for outlying cells).

DDCx$nbngbrs

## [1] 19

# For each column the code looked for up to 19 non-self neighbors (highly correlated columns).
# It goes through all of them, unless fastDDC is set to TRUE.

DDCx$ngbrs[1:3,]

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,]    1   14   17    4   12    3   16    8   20    18    19     5    11    15
## [2,]    2   17   18   12   11   15   16   10   13     1     4     6     3    19
## [3,]    3   12   15    1   17    6   11    5    7     4    18    16    10     2
##      [,15] [,16] [,17] [,18] [,19] [,20]
## [1,]     6     2     9    10     7    13
## [2,]    14     7     5     9    20     8
## [3,]    14    19     8    20     9    13

# Shows the neighbors, e.g. the nearest non-self neighbor of X1 is X11, then X2,...

round(DDCx$robcors[1:3,],2)

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,]    1 0.96 0.94 0.93 0.93 0.93 0.93 0.93 0.93  0.93  0.93  0.92  0.92  0.91
## [2,]    1 0.96 0.94 0.93 0.93 0.92 0.92 0.92 0.91  0.91  0.91  0.91  0.90  0.90
## [3,]    1 0.96 0.93 0.93 0.92 0.92 0.92 0.91 0.91  0.91  0.91  0.91  0.90  0.90
##      [,15] [,16] [,17] [,18] [,19] [,20]
## [1,]  0.91  0.91  0.91  0.91  0.90  0.89
## [2,]  0.89  0.89  0.88  0.88  0.87  0.85
## [3,]  0.90  0.90  0.90  0.90  0.89  0.89

# Robust correlations with these neighbors. In each row the correlations
# are sorted by decreasing absolute value. 

round(DDCx$robslopes[1:3,],2)

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,]    1 0.99 0.99 0.85 1.03 0.76 1.02 1.02 0.93  0.95  0.93  0.84  0.93  0.99
## [2,]    1 1.00 0.91 0.99 0.91 0.94 0.99 0.77 0.85  0.94  0.82  0.88  0.76  0.94
## [3,]    1 1.18 1.17 1.16 1.07 1.07 1.04 1.01 0.91  0.97  1.14  1.15  0.79  1.08
##      [,15] [,16] [,17] [,18] [,19] [,20]
## [1,]  0.91  0.88  0.70  0.70  0.73  0.82
## [2,]  0.94  0.76  0.85  0.66  0.84  0.89
## [3,]  1.13  1.16  1.07  1.06  0.83  1.02

# For each column, the slope of each neighbor predicting it.
# For instance, X1 is predicted by its first neighbor with 
# slope 0.97 and by its second neighbor with slope 0.81 .

round(DDCx$deshrinkage,2)

##   X1   X2   X3   X4   X5   X6   X7   X8   X9  X10  X11  X12  X13  X14  X15  X16 
## 1.09 1.08 1.08 1.10 1.10 1.08 1.08 1.10 1.11 1.13 1.10 1.07 1.09 1.09 1.08 1.06 
##  X17  X18  X19  X20 
## 1.10 1.06 1.10 1.11

# For each column, the factor by which its prediction is multiplied.

round(DDCx$Xest[1:12,1:10],2) # the estimated cells of remX:

##       X1    X2    X3    X4    X5    X6    X7    X8    X9   X10
## 1  -0.48 -0.58 -0.61 -0.48 -0.55 -0.69 -0.63 -0.52 -0.49 -0.41
## 2  -0.53 -0.63 -0.67 -0.54 -0.63 -0.75 -0.69 -0.59 -0.55 -0.46
## 3   0.22  0.12  0.07  0.22  0.14  0.04  0.02  0.23  0.20  0.26
## 4   0.59  0.50  0.44  0.58  0.51  0.44  0.35  0.63  0.56  0.61
## 5  -0.45 -0.58 -0.61 -0.46 -0.54 -0.66 -0.63 -0.50 -0.46 -0.40
## 6   1.77  1.68  1.61  1.71  1.66  1.66  1.46  1.86  1.75  1.74
## 7  -0.42 -0.53 -0.56 -0.43 -0.51 -0.62 -0.59 -0.46 -0.42 -0.36
## 8   0.34  0.23  0.17  0.32  0.25  0.14  0.10  0.35  0.31  0.36
## 9   0.36  0.27  0.20  0.34  0.26  0.17  0.13  0.35  0.33  0.40
## 10  0.97  0.86  0.80  0.95  0.89  0.82  0.70  1.02  0.92  0.97
## 11  0.18  0.07  0.03  0.16  0.09 -0.01 -0.03  0.17  0.14  0.21
## 12 -1.62 -1.73 -1.73 -1.58 -1.69 -1.83 -1.68 -1.71 -1.59 -1.52

round(DDCx$stdResid[1:12,1:10],1)

##      X1    X2   X3   X4   X5    X6    X7    X8    X9  X10
## 1  -0.5   1.1  1.4 -0.6  1.1  -0.7   1.3  -0.1   0.4 -0.3
## 2   0.9   0.5 -0.1 -0.4 -1.5 -25.3   0.7  -2.0  -0.9  0.2
## 3  -1.0   0.0 -1.1  0.6  1.2   0.2   0.3   0.2    NA -0.3
## 4  -0.8   0.1 -0.7   NA   NA   0.4   0.1 -28.1   0.2 -2.1
## 5   1.5  -0.2   NA  0.8  0.2   0.5  -1.1   0.7   1.0 -0.6
## 6   0.5   0.5  0.8  0.4 -0.4   0.1  -0.6  -0.2   0.8 -0.6
## 7    NA   0.2  0.1 -1.1 -0.3   0.8  -0.1  27.6   0.5 -1.0
## 8   0.3   0.8 -0.6  0.5 -0.1  -1.4 -30.3   0.9   0.2 -1.1
## 9  -0.3   1.4  0.2 -0.4 -0.9   0.0  -0.6  -1.4  -0.5  1.6
## 10 -0.3  -0.8  0.5  1.5  1.1   0.9    NA  -1.0  -0.1 -1.9
## 11   NA  -0.2  1.1 -1.4 -1.0   0.1   0.5   0.1  -0.8 -1.7
## 12 -2.3 -21.1  0.7 -0.6  0.1   1.7   0.8  -0.4 -22.1 -2.0

# The standardized residuals of the cells. Note the NA's and some
# large positive and negative cell residuals.

qqnorm(as.vector(DDCx$stdResid)) # Note the far outliers on both sides:

plot of chunk unnamed-chunk-6

as.vector(DDCx$indcells) # indices of the cells that were flagged by DDC:

##   [1]  27  34  35  41  62  76  79  80  90  93 100 114 142 146 164 178 180 195
##  [19] 197 219 225 237 245 252 267 273 281 284 292 297 299 308 342 354 357 367
##  [37] 388 396 397 412 430 433 439 465 482 506 529 534 548 553 561 564 566 585
##  [55] 594 597 628 631 643 648 654 663 669 675 685 696 699 704 705 715 728 736
##  [73] 754 757 758 760 784 806 816 817 831 836 843 868 881 886 887 896 904 905
##  [91] 910 921 925 942 950 956 978 981 992 996

plot(DDCx$Ti) # the Ti values of the rows. None are high.

plot of chunk unnamed-chunk-6

qqnorm(DDCx$Ti)

plot of chunk unnamed-chunk-6

DDCx$medTi # median of the raw Ti (used to standardize Ti):

## [1] 0.001298725

DDCx$madTi # median absolute deviation of the raw Ti (used to standardize Ti):

## [1] 0.06241016

DDCx$indrows # numeric(0) means no rows are flagged:

## numeric(0)

as.vector(DDCx$indall) # indices of the flagged cells, including those in flagged rows:

##   [1]  27  34  35  41  62  76  79  80  90  93 100 114 142 146 164 178 180 195
##  [19] 197 219 225 237 245 252 267 273 281 284 292 297 299 308 342 354 357 367
##  [37] 388 396 397 412 430 433 439 465 482 506 529 534 548 553 561 564 566 585
##  [55] 594 597 628 631 643 648 654 663 669 675 685 696 699 704 705 715 728 736
##  [73] 754 757 758 760 784 806 816 817 831 836 843 868 881 886 887 896 904 905
##  [91] 910 921 925 942 950 956 978 981 992 996

as.vector(DDCx$indNAs) # indices of the missing cells:

##  [1]   7  11  28  44  97 105 154 174 176 184 204 249 264 310 348 403 427 443 489
## [20] 491 517 523 550 556 589 602 606 609 615 624 645 646 658 700 725 798 803 814
## [39] 867 870 945 954 979 980 999

round(DDCx$Ximp[1:10,1:10],2)

##       X1    X2    X3    X4    X5    X6    X7    X8    X9   X10
## 1  -0.66 -0.16 -0.22 -0.66 -0.22 -0.94 -0.20 -0.57 -0.32 -0.49
## 2  -0.21 -0.45 -0.69 -0.66 -1.09 -0.75 -0.46 -1.36 -0.90 -0.41
## 3  -0.13  0.13 -0.24  0.43  0.51  0.11  0.11  0.31  0.20  0.19
## 4   0.32  0.53  0.25  0.58  0.51  0.58  0.40  0.63  0.64  0.06
## 5   0.07 -0.65 -0.61 -0.20 -0.48 -0.49 -1.01 -0.24 -0.10 -0.56
## 6   1.93  1.89  1.85  1.85  1.55  1.69  1.27  1.77  2.07  1.59
## 7  -0.42 -0.46 -0.54 -0.78 -0.60 -0.34 -0.62 -0.46 -0.22 -0.63
## 8   0.44  0.53  0.01  0.47  0.23 -0.37  0.10  0.71  0.37  0.07
## 9   0.24  0.81  0.26  0.21 -0.03  0.16 -0.06 -0.16  0.13  0.81
## 10  0.88  0.55  0.94  1.45  1.24  1.17  0.70  0.64  0.88  0.46

# The imputed matrix. Both the cellwise outliers and the missing values
# are replaced by their predicted values.

round((DDCx$Ximp - DDCx$remX)[1:10,1:10],2)

##    X1 X2 X3 X4 X5   X6   X7     X8 X9 X10
## 1   0  0  0  0  0 0.00  0.0   0.00  0   0
## 2   0  0  0  0  0 9.25  0.0   0.00  0   0
## 3   0  0  0  0  0 0.00  0.0   0.00 NA   0
## 4   0  0  0 NA NA 0.00  0.0  10.63  0   0
## 5   0  0 NA  0  0 0.00  0.0   0.00  0   0
## 6   0  0  0  0  0 0.00  0.0   0.00  0   0
## 7  NA  0  0  0  0 0.00  0.0 -10.46  0   0
## 8   0  0  0  0  0 0.00 10.1   0.00  0   0
## 9   0  0  0  0  0 0.00  0.0   0.00  0   0
## 10  0  0  0  0  0 0.00   NA   0.00  0   0

# The nonzero values and the NA's correspond to imputed cells.

TopGear dataset

The Top Gear data contains information on 297 cars.

library(robustHD)
data(TopGear)

dim(TopGear)

## [1] 297  32

rownames(TopGear)[1:13] # "1" to "297" are not useful names

##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13"

rownames(TopGear) = paste(TopGear[,1],TopGear[,2]) 
# Now the rownames are make and model of the cars.
rownames(TopGear)[165] = "Mercedes-Benz G" # name was too long
myTopGear = TopGear[,-31] # removes the subjective variable `Verdict'

# Transform some variables to get roughly gaussianity in the center:
transTG = myTopGear
transTG$Price = log(myTopGear$Price)
transTG$Displacement = log(myTopGear$Displacement)
transTG$BHP = log(myTopGear$BHP)
transTG$Torque = log(myTopGear$Torque)
transTG$TopSpeed = log(myTopGear$TopSpeed)

# Run the DDC method:
DDCpars = list(fastDDC = FALSE, silent = TRUE)
DDCtransTG = DDC(transTG,DDCpars)

##  
##  The final data set we will analyze has 296 rows and 11 columns.
##

# With DDCpars = list(fastDDC = FALSE, silent = FALSE) we obtain more information:
# 
# The input data has 297 rows and 31 columns.
# 
# The input data contained 19 non-numeric columns (variables).
# Their column names are:
#   
#  [1] Maker         Model              Type               Fuel              
#  [5] DriveWheel    AdaptiveHeadlights AdjustableSteering AlarmSystem       
#  [9] Automatic     Bluetooth          ClimateControl     CruiseControl     
# [13] ElectricSeats Leather            ParkingSensors     PowerSteering     
# [17] SatNav        ESP                Origin            
# 
# These columns will be ignored in the analysis.
# We continue with the remaining 12 numeric columns:
#   
# [1] Price Cylinders Displacement BHP   Torque Acceleration TopSpeed    
# [8] MPG   Weight    Length       Width Height      
# 
# The data contained 1 rows with over 50% of NAs.
# Their row names are:
#   
#   [1] Citroen C5 Tourer
# 
# These rows will be ignored in the analysis.
# We continue with the remaining 296 rows:
#   
#   [1] Alfa Romeo Giulietta             Alfa Romeo MiTo                 
# .......
# [295] Volvo XC70                       Volvo XC90                      
# 
# The data contained 1 columns with zero or tiny median absolute deviation.
# Their column names are:
# 
# [1] Cylinders
# 
# These columns will be ignored in the analysis.
# We continue with the remaining 11 columns:
# 
# [1] Price  Displacement BHP   Torque Acceleration TopSpeed MPG         
# [8] Weight Length       Width Height      
# 
# The final data set we will analyze has 296 rows and 11 columns.

remX = DDCtransTG$remX # the remaining part of the dataset
dim(remX)

## [1] 296  11

colSums(is.na(remX)) # There are still NAs, mainly in `Weight':

##        Price Displacement          BHP       Torque Acceleration     TopSpeed 
##            0            8            3            3            0            3 
##          MPG       Weight       Length        Width       Height 
##           11           32           10           15           10

# Analyze the data by column:
standX = scale(remX,apply(remX,2,median,na.rm = TRUE), 
               apply(remX,2,mad,na.rm = TRUE))
dim(standX)

## [1] 296  11

round(standX[1:5,],1) # has NAs where remX does

##                          Price Displacement  BHP Torque Acceleration TopSpeed
## Alfa Romeo Giulietta      -0.4         -0.4 -0.7    0.0          0.6     -0.5
## Alfa Romeo MiTo           -0.9         -0.7 -0.7   -1.6          0.4     -0.4
## Aston Martin Cygnet        0.3         -0.8 -0.8   -1.7          0.7     -0.9
## Aston Martin DB9           2.7          2.1  1.9    1.2         -1.2      2.0
## Aston Martin DB9 Volante   2.8          2.1  1.9    1.2         -1.2      2.0
##                           MPG Weight Length Width Height
## Alfa Romeo Giulietta      1.0   -0.3   -0.3  -0.2   -0.2
## Alfa Romeo MiTo           0.1   -1.0   -0.9  -1.1   -0.3
## Aston Martin Cygnet       0.6   -1.3   -3.1  -1.5    0.1
## Aston Martin DB9         -1.7    0.8    0.6    NA   -1.6
## Aston Martin DB9 Volante -1.7    1.0    0.6    NA   -1.6

transTGcol = remX
transTGcol[abs(standX) > sqrt(qchisq(0.99,1))] = NA
round(transTGcol[1:5,],1) # has NAs in outlying cells as well:

##                          Price Displacement BHP Torque Acceleration TopSpeed
## Alfa Romeo Giulietta      10.0          7.4 4.7    5.5         11.3      4.7
## Alfa Romeo MiTo            9.6          7.2 4.7    4.6         10.7      4.8
## Aston Martin Cygnet       10.3          7.2 4.6    4.5         11.8      4.7
## Aston Martin DB9            NA          8.7 6.2    6.1          4.6      5.2
## Aston Martin DB9 Volante    NA          8.7 6.2    6.1          4.6      5.2
##                          MPG Weight Length Width Height
## Alfa Romeo Giulietta      64   1385   4351  1798   1465
## Alfa Romeo MiTo           49   1090   4063  1720   1446
## Aston Martin Cygnet       56    988     NA  1680   1500
## Aston Martin DB9          19   1785   4720    NA   1282
## Aston Martin DB9 Volante  19   1890   4720    NA   1282

# Make untransformed submatrix of X for labeling the cells in the plot:
tempX = myTopGear[DDCtransTG$rowInAnalysis,DDCtransTG$colInAnalysis]
tempX$Price = tempX$Price/1000 # to avoid printing long numbers
dim(tempX)

## [1] 296  11

# Show the following 17 cars in the cellmap:
showrows = c(12,42,56,73,81,94,99,135,150,164,176,198,209,215,234,241,277)

# Make two ggplot2 objects:
ggpcol = cellMap(standX, showcellvalues="D", D=tempX, 
                 mTitle="By column", showrows=showrows,
                 sizecellvalues = 0.6, adjustrowlabels=0.5)  
plot(ggpcol)

plot of chunk unnamed-chunk-9

ggpDDC = cellMap(DDCtransTG$stdResid, showcellvalues="D", D=tempX,  
                 mTitle="DetectDeviatingCells", showrows=showrows,
                 sizecellvalues = 0.6, adjustrowlabels=0.5)  
plot(ggpDDC)

plot of chunk unnamed-chunk-10

# Creating the pdf:
# pdf("cellMap_TopGear.pdf", width = 12, height = 10)
# gridExtra::grid.arrange(ggpcol,ggpDDC,nrow=1) # combines 2 plots in a figure
# dev.off()

Analyzing new data by DDCpredict

We now consider the 17 cars shown in the cellmap as a `new’ dataset.

# Top Gear dataset: prediction of "new" data
############################################
# For comparison we first remake the cell map of the entire dataset, but now 
# showing the values of the residuals instead of the data values:

dim(remX) # 296 11

## [1] 296  11

ggpDDC = cellMap(DDCtransTG$stdResid, showcellvalues="R", 
                 sizecellvalues = 0.7, mTitle="DetectDeviatingCells", 
                 showrows=showrows, adjustrowlabels=0.5)
plot(ggpDDC)

plot of chunk unnamed-chunk-11

Define the “initial” dataset as the rows not in these 17:

initX = remX[-showrows,]
dim(initX) # 279 11

## [1] 279  11

# Fit initX:
DDCinitX = DDC(initX,DDCpars=DDCpars)

##  
##  The final data set we will analyze has 278 rows and 11 columns.
##

Define the “new” dataset, and apply DDCpredict to it:

newX = remX[showrows,]
dim(newX) # 17  11

## [1] 17 11

# Make predictions by DDCpredict. 
# Its inputs are:
#   Xnew       : the new data (test data)
#   InitialDDC : Must be provided.
#   DDCpars    : the input options to be used for the prediction.
#                By default the options of InitialDDC are used. 

predictDDC = DDCpredict(newX,DDCinitX)

names(DDCinitX)

##  [1] "DDCpars"         "colInAnalysis"   "rowInAnalysis"   "namesNotNumeric"
##  [5] "namesCaseNumber" "namesNAcol"      "namesNArow"      "namesDiscrete"  
##  [9] "namesZeroScale"  "remX"            "locX"            "scaleX"         
## [13] "Z"               "nbngbrs"         "ngbrs"           "robcors"        
## [17] "robslopes"       "deshrinkage"     "Xest"            "scalestres"     
## [21] "stdResid"        "indcells"        "Ti"              "medTi"          
## [25] "madTi"           "indrows"         "indall"          "indNAs"         
## [29] "Ximp"

# For comparison with:

names(predictDDC) # Fewer, since DDCpredict does not call checkDataSet:

##  [1] "DDCpars"     "locX"        "scaleX"      "Z"           "nbngbrs"    
##  [6] "ngbrs"       "robcors"     "robslopes"   "deshrinkage" "Xest"       
## [11] "scalestres"  "stdResid"    "indcells"    "Ti"          "medTi"      
## [16] "madTi"       "indrows"     "indNAs"      "indall"      "Ximp"

# If you specify the parameters the result is the same:
predictDDC2 = DDCpredict(newX,DDCinitX,DDCpars=DDCpars)
all.equal(predictDDC,predictDDC2) # TRUE

## [1] TRUE

ggpnew = cellMap(predictDDC$stdResid, showcellvalues="R",
                 sizecellvalues = 0.7, mTitle="DDCpredict",
                 adjustrowlabels=0.5) 
plot(ggpnew) # Looks quite similar to the result using the entire dataset:

plot of chunk unnamed-chunk-14

# Creating the pdf:
# pdf("TopGear_DDCpredict.pdf",width=12,height=10)
# gridExtra::grid.arrange(ggpDDC,ggpnew,nrow=1)
# dev.off()

Philips data

The philips data contains 9 measurements of TV parts from a production line.

data(data_philips)
dim(data_philips)

## [1] 677   9

colnames(data_philips) = c("X1","X2","X3","X4","X5","X6","X7","X8","X9")

DDCphilips = DDC(data_philips)

##  
##  The input data has 677 rows and 9 columns.

qqnorm(as.vector(DDCphilips$Z)) # rather gaussian, here we only see 2 outliers:

plot of chunk unnamed-chunk-15

round(DDCphilips$stdResid[1:12,],1) # the standardized residuals:

##      X1   X2   X3   X4   X5   X6  X7   X8   X9
## 1  -0.9  2.2  0.0  1.2  0.8 -2.9 2.1  0.1 -1.4
## 2  -0.5  2.0  0.0  3.9  0.8 -1.1 3.5 -0.4 -1.7
## 3  -1.0  2.3 -0.4  0.7  0.6 -2.0 1.4  0.0 -1.7
## 4  -0.6  2.3 -0.1  3.8  1.1 -0.9 3.4 -0.5 -2.0
## 5  -0.6  2.8 -0.2  3.4  0.9 -0.9 3.4  0.2 -1.9
## 6  -0.7  2.6  0.0  4.4  1.0 -1.1 4.2 -0.1 -2.0
## 7   1.0 -0.1 -1.0 -0.3 -1.2 -1.0 1.4  0.8 -0.6
## 8  -0.4 -1.4 -0.7  1.8 -0.7 -2.8 1.9 -0.3 -1.7
## 9   1.0 -0.2  0.0 -0.1 -1.2 -2.1 1.6  1.0 -0.9
## 10  0.2 -1.3 -0.1  1.0 -0.5 -2.9 2.0 -0.1 -1.5
## 11 -0.1 -0.3 -0.1  0.2 -1.1 -1.5 1.9  0.8 -0.8
## 12 -0.1 -0.4 -1.5  2.5  0.0 -3.4 2.3  0.3 -2.5

DDCphilips$indcells # indices of the cells that were flagged:

##   [1]   55   70   95  104  116  144  151  156  161  170  174  175  324  468  682
##  [16]  683  712  747  750  792  985  986  987  988  989  991  992 1006 1007 1008
##  [31] 1010 1011 1196 1199 1450 1452 1455 1474 1529 1787 2033 2035 2036 2037 2045
##  [46] 2047 2094 2101 2114 2116 2126 2135 2267 2312 2355 2365 2464 2499 2592 2743
##  [61] 2775 2777 3120 3216 3226 3227 3228 3248 3386 3393 3395 3397 3399 3401 3412
##  [76] 3413 3414 3415 3416 3417 3423 3424 3425 3426 3427 3428 3430 3431 3437 3445
##  [91] 3456 3457 3460 3464 3465 3466 3468 3469 3470 3990 4064 4066 4067 4068 4076
## [106] 4078 4088 4145 4147 4792 4794 5036 5037 5168 5169 5170 5171 5172 5174 5175
## [121] 5176 5230 5238 5241 5246 5248 5250 5253 5256 5258 5259 5260 5262 5263 5265
## [136] 5267 5268 5270 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5569
## [151] 5869 5873 5882 5890 5902

DDCphilips$indrows # flagged rows:

## [1]  26  56  84 334 491

We also apply the rowwise method MCD to detect outlying rows:

library(robustbase) # for covMcd

MCDphilips = robustbase::covMcd(data_philips)
indrowsMCD = which(mahalanobis(data_philips,MCDphilips$center,
                               MCDphilips$cov) > qchisq(0.975,df=9))

plot(sqrt(mahalanobis(data_philips,MCDphilips$center,MCDphilips$cov)),
     main="Philips data",ylab="Robust distances",xlab="",pch=20)
abline(h=sqrt(qchisq(0.975,df=9))) # this horizontal line is the cutoff.

plot of chunk unnamed-chunk-17

# dev.copy(pdf,"Figure_philips_left.pdf",width=10,height=4)
# dev.off()

# cellMaps with rectangular blocks:

ggpMCDphilips = cellMap(data_philips, indrows=indrowsMCD,
                        mTitle="MCD", nrowsinblock=15,
                        ncolumnsinblock=1, drawCircles = TRUE)

## No rowblocklabels were given, so they are constructed automatically.

plot(ggpMCDphilips)

plot of chunk unnamed-chunk-18

ggpDDCphilips = cellMap(DDCphilips$stdResid, indrows=DDCphilips$indrows,
                        mTitle="DetectDeviatingCells", nrowsinblock=15,
                        ncolumnsinblock=1, drawCircles = TRUE)

## No rowblocklabels were given, so they are constructed automatically.

plot(ggpDDCphilips)

plot of chunk unnamed-chunk-18

# dev.copy(pdf,"Figure_philips_right.pdf",width=6,height=12)
# dev.off()

Mortality dataset

This dataset contains the mortality of French males in the years 1816 to 2013.

data(data_mortality)
dim(data_mortality)

## [1] 198  91

# 198  91
rownames(data_mortality)[1:5]

## [1] "1816" "1817" "1818" "1819" "1820"

colnames(data_mortality)[1:5]

## [1] "0" "1" "2" "3" "4"

DDCpars = list(fastDDC = FALSE, silent = TRUE)
DDCmortality = DDC(data_mortality,DDCpars) # 1 second

remX = DDCmortality$remX
dim(remX)

## [1] 198  91

We also apply the rowwise method ROBPCA to detect outlying rows:

library(rrcov) # contains ROBPCA

PCAmortality = rrcov::PcaHubert(data_mortality,alpha=0.75,scale=FALSE)

ggpROBPCA = cellMap(remX, indrows=which(PCAmortality@flag==FALSE),
                    mTitle="By row", nrowsinblock=5,
                    ncolumnsinblock=5, rowtitle = "Years",
                    columntitle = "Age", sizetitles = 1.5,
                    drawCircles = TRUE)

## No rowblocklabels were given, so they are constructed automatically.
## No columnblocklabels were given, so they are constructed automatically.

plot(ggpROBPCA)

plot of chunk unnamed-chunk-21

ggpDDC = cellMap(DDCmortality$stdResid, 
                 mTitle="DetectDeviatingCells", nrowsinblock=5,
                 ncolumnsinblock=5, rowtitle = "Years",
                 columntitle = "Age", sizetitles = 1.5)

## No rowblocklabels were given, so they are constructed automatically.
## No columnblocklabels were given, so they are constructed automatically.

plot(ggpDDC) # Leads to a detailed interpretation:

plot of chunk unnamed-chunk-21

# pdf("cellmap_mortality.pdf",width=9,height=8)
# gridExtra::grid.arrange(ggpROBPCA,ggpDDC,nrow=1)
# dev.off()

To illustrate the ability to make row blocks and column blocks of arbitrary sizes, we create the following artificial groupings:

rowblocksizes = c(84,14,5,21,6,35,33)
rowlabels = c("19th century","1900-1913","WW1","1919-1939",
"WW2","1946-1980","recent")
colblocksizes = c(5,5,10,10,10,10,10,10,10,10,1)
collabels = c("upto 4","5 to 9","10 to 19","20 to 29","30 to 39","40 to 49","50 to 59","60 to 69","70 to 79","80 to 89","90+")

ggpDDC = cellMap(DDCmortality$stdResid,
                 mTitle = "Cellmap with manual blocks",
                 manualrowblocksizes = rowblocksizes,
                 rowblocklabels = rowlabels,
                 manualcolumnblocksizes = colblocksizes,
                 columnblocklabels = collabels,
                 rowtitle = "Epochs",
                 columntitle = "Age groups",
                 sizetitles = 1.2)
# pdf("cellmap_mortality_manual_blocks.pdf",width=5,height=3)
plot(ggpDDC)

plot of chunk unnamed-chunk-22

# dev.off()

Glass dataset

The glass data consists of spectra with 750 wavelengths of 180 archaeological glass samples.

data(data_glass)
DDCpars = list(fastDDC = FALSE, silent = TRUE)
DDCglass = DDC(data_glass,DDCpars) # takes 8 seconds

##  
##  The final data set we will analyze has 180 rows and 737 columns.
##

remX = DDCglass$remX
# With DDCpars$silent = FALSE we obtain more information:
#
#  The input data has 180 rows and 750 columns.
#
#  The data contained 11 discrete columns with 3 or fewer values.
#  Their column names are:
#
#  [1] V1  V2  V3  V4  V5  V6  V7  V8  V9  V10 V11
#
#  These columns will be ignored in the analysis.
#  We continue with the remaining 739 columns:
#  
#   [1] V12  V13  V14  V15  V16  V17  V18  V19  V20  V21  V22  V23  V24  V25
#    ......
# [729] V740 V741 V742 V743 V744 V745 V746 V747 V748 V749 V750
#
#  The data contained 2 columns with zero or tiny median absolute deviation.
#  Their column names are:
#
# [1] V12 V13
#
#  These columns will be ignored in the analysis.
#  We continue with the remaining 737 columns:
#
#   [1] V14  V15  V16  V17  V18  V19  V20  V21  V22  V23  V24  V25  V26  V27
#    ......
# [729] V742 V743 V744 V745 V746 V747 V748 V749 V750
#
#  The final data set we will analyze has 180 rows and 737 columns.

dim(remX)

## [1] 180 737

We will compare this with the faster approximate algorithm of DDC, obtained by the option fastDDC=TRUE:

fastDDCpars = list(fastDDC = TRUE, silent = TRUE)
fastDDCglass = DDC(data_glass, fastDDCpars) # takes 2 seconds

##  
##  The final data set we will analyze has 180 rows and 737 columns.
##

remXfast = fastDDCglass$remX
all.equal(remX,remXfast) # The remaining data is the same:

## [1] TRUE

We also apply the rowwise method ROBPCA to detect outlying rows:

library(rrcov) # contains ROBPCA

PCAglass = rrcov::PcaHubert(remX,alpha=0.75,scale=FALSE)

n = nrow(remX)
nrowsinblock = 5
rowtitle = "glass samples"
rowlabels = rep("",floor(n/nrowsinblock));
rowlabels[1] = "1"
rowlabels[floor(n/nrowsinblock)] = "n";

d = ncol(remX)
ncolumnsinblock = 5
columntitle = "wavelengths"
columnlabels = rep("",floor(d/ncolumnsinblock));
columnlabels[1] = "1";
columnlabels[floor(d/ncolumnsinblock)] = "d"

ggpROBPCA = cellMap(matrix(0,n,d),
                    indrows=which(PCAglass@flag==FALSE),
                    rowblocklabels=rowlabels,
                    columnblocklabels=columnlabels,
                    mTitle="By row", nrowsinblock=5,
                    ncolumnsinblock=5,
                    rowtitle="glass samples",
                    columntitle="wavelengths",
                    sizetitles=1.2,
                    columnangle=0,
                    drawCircles = TRUE)
plot(ggpROBPCA)

plot of chunk unnamed-chunk-26

ggpDDC = cellMap(DDCglass$stdResid,
                 indrows=DDCglass$indrows,
                 rowblocklabels=rowlabels,
                 columnblocklabels=columnlabels,
                 mTitle="DDC", nrowsinblock=5,
                 ncolumnsinblock=5,
                 rowtitle="glass samples",
                 columntitle="wavelengths",
                 sizetitles=1.2,
                 columnangle=0,
                 drawCircles = TRUE)
plot(ggpDDC)

plot of chunk unnamed-chunk-26

# pdf("cellmap_glass_ROBPCA_DDC.pdf",width=8,height=6)
# gridExtra::grid.arrange(ggpROBPCA,ggpDDC,ncol=1)
# dev.off()

ggpfastDDC = cellMap(fastDDCglass$stdResid,
                     indrows=fastDDCglass$indrows,
                     rowblocklabels=rowlabels,
                     columnblocklabels=columnlabels,
                     mTitle="fastDDC", nrowsinblock=5,
                     ncolumnsinblock=5,
                     rowtitle="glass samples",
                     columntitle="wavelengths",
                     sizetitles=1.2,
                     columnangle=0,
                     drawCircles = TRUE)
plot(ggpfastDDC)

plot of chunk unnamed-chunk-26

# pdf("cellmap_glass_DDC_fastDDC.pdf",width=8,height=6)
# gridExtra::grid.arrange(ggpDDC,ggpfastDDC,ncol=1)
# dev.off()

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.

DDC examples

Raymaekers, J. and Rousseeuw, P.J.

2023-10-25

Introduction

Example with row and column selection

Small generated dataset

TopGear dataset

Analyzing new data by DDCpredict

Define the “initial” dataset as the rows not in these 17:

Define the “new” dataset, and apply DDCpredict to it:

Philips data

We also apply the rowwise method MCD to detect outlying rows:

Mortality dataset

We also apply the rowwise method ROBPCA to detect outlying rows:

Glass dataset

We will compare this with the faster approximate algorithm of DDC, obtained by the option fastDDC=TRUE:

We also apply the rowwise method ROBPCA to detect outlying rows: