vtreat is an R package that turns arbitrary data frames into clean data frames that are ready for analysis. A clean data frame:

To achieve this a number of techniques are used. Principally:

For more details see: the vtreat article

The main pattern is to use designTreatmentsC() (categorical outcome) or designTreatmentsN() (numeric outcome) to design a treatment plan, and then to use the returned structure with prepare() to apply the plan to data frames. The main feature of vtreat is that all data preparation is “y-aware”: it uses the relations of the effective variables to the dependent (outcome) variable to encode the effective variables.

The structure returned from designTreatmentsN() or designTreatmentsC() includes informational fields. The main fields are mostly vectors with names (all with the same names in the same order):

In addition, designTreatmentsC() returns one more vector

In all cases we have two upward biases on the scores

An example is:

library(vtreat)
# Categorical-outcome example data: a string column (x), a numeric column (z)
# and a logical outcome (y). Both x and z contain a missing value.
dTrainC <- data.frame(
  x = c(rep('a', 3), rep('b', 2), NA),
  z = c(1, 2, 3, 4, NA, 6),
  y = c(FALSE, FALSE, TRUE, FALSE, TRUE, TRUE)
)
head(dTrainC)
##      x  z     y
## 1    a  1 FALSE
## 2    a  2 FALSE
## 3    a  3  TRUE
## 4    b  4 FALSE
## 5    b NA  TRUE
## 6 <NA>  6  TRUE
# Frame to be treated later: no outcome column, and x carries a level ('c')
# that never occurs in dTrainC.
dTestC <- data.frame(
  x = c('a', 'b', 'c', NA),
  z = c(10, 20, 30, NA)
)
head(dTestC)
##      x  z
## 1    a 10
## 2    b 20
## 3    c 30
## 4 <NA> NA
# Design a treatment plan on the training frame for the outcome column 'y'.
# Every column name (including 'y') is passed as a candidate variable; the
# log below shows treatments being designed only for x and z.
# NOTE(review): the trailing TRUE is presumably the outcome value treated as
# the target class -- confirm against the designTreatmentsC() documentation.
treatmentsC <- designTreatmentsC(dTrainC,colnames(dTrainC),'y',TRUE)
## [1] "desigining treatments Mon May 25 07:50:27 2015"
## [1] "design var x Mon May 25 07:50:27 2015"
## [1] "design var z Mon May 25 07:50:27 2015"
## [1] "scoring columns Mon May 25 07:50:27 2015"
## [1] "score variable(s) x_lev_NA (derived from x ) Mon May 25 07:50:27 2015" 
## [2] "score variable(s) x_lev_x.a (derived from x ) Mon May 25 07:50:27 2015"
## [3] "score variable(s) x_lev_x.b (derived from x ) Mon May 25 07:50:27 2015"
## [1] "score variable(s) x_catB (derived from x ) Mon May 25 07:50:27 2015"
## [1] "score variable(s) z_clean (derived from z ) Mon May 25 07:50:27 2015"
## [1] "score variable(s) z_isBAD (derived from z ) Mon May 25 07:50:27 2015"
## [1] "have treatment plan Mon May 25 07:50:27 2015"
# Print the whole plan: the list of treatments, the derived variable names,
# and the per-variable vectors (varScores, PRESSRsquared, varMoves,
# catPseudoRSquared), which all share the same names in the same order.
print(treatmentsC)
## $treatments
## $treatments[[1]]
## [1] "vtreat 'Categoric Indicators'('x'->character->'x_lev_NA','x_lev_x.a','x_lev_x.b')"
## 
## $treatments[[2]]
## [1] "vtreat 'Bayesian Impact Code'('x'->character->'x_catB')"
## 
## $treatments[[3]]
## [1] "vtreat 'Scalable pass through'('z'->numeric->'z_clean')"
## 
## $treatments[[4]]
## [1] "vtreat 'is.bad'('z'->numeric->'z_isBAD')"
## 
## 
## $vars
## [1] "x_lev_NA"  "x_lev_x.a" "x_lev_x.b" "x_catB"    "z_clean"   "z_isBAD"  
## 
## $varScores
##  x_lev_NA x_lev_x.a x_lev_x.b    x_catB   z_clean   z_isBAD 
##  1.509383  1.999938  1.000000  1.374970  1.091783  1.509383 
## 
## $PRESSRsquared
##    x_lev_NA   x_lev_x.a   x_lev_x.b      x_catB     z_clean     z_isBAD 
## -0.50938261 -0.99993834  0.00000000 -0.37496957 -0.09178333 -0.50938261 
## 
## $varMoves
##  x_lev_NA x_lev_x.a x_lev_x.b    x_catB   z_clean   z_isBAD 
##      TRUE      TRUE     FALSE      TRUE      TRUE      TRUE 
## 
## $outcomename
## [1] "y"
## 
## $meanY
## [1] 0.5
## 
## $ndat
## [1] 6
## 
## $catPseudoRSquared
##      x_lev_NA     x_lev_x.a     x_lev_x.b        x_catB       z_clean 
##  3.330669e-16 -2.220446e-16  0.000000e+00  5.888770e-02 -4.748721e-02 
##       z_isBAD 
##  3.330669e-16 
## 
## attr(,"class")
## [1] "treatmentplan"
# A single treatment can also be printed on its own.
print(treatmentsC$treatments[[1]])
## [1] "vtreat 'Categoric Indicators'('x'->character->'x_lev_NA','x_lev_x.a','x_lev_x.b')"
# Apply the categorical treatment plan to the training data.
# pruneLevel = NULL is the same value the original c() evaluated to, spelled
# explicitly. NOTE(review): presumably NULL disables score-based pruning --
# confirm against the prepare() documentation.
dTrainCTreated <- prepare(treatmentsC, dTrainC,
                          pruneLevel = NULL, scale = TRUE)
# Note: x_lev_x.b (varMoves FALSE in the plan) is absent from the result.
head(dTrainCTreated)
##   x_lev_NA  x_lev_x.a      x_catB       z_clean z_isBAD     y
## 1     -0.1 -0.1666667 -0.18838870 -3.864865e-01    -0.1 FALSE
## 2     -0.1 -0.1666667 -0.18838870 -2.108108e-01    -0.1 FALSE
## 3     -0.1 -0.1666667 -0.18838870 -3.513514e-02    -0.1  TRUE
## 4     -0.1  0.1666667  0.05164882  1.405405e-01    -0.1 FALSE
## 5     -0.1  0.1666667  0.05164882 -2.220446e-16     0.5  TRUE
## 6      0.5  0.1666667  0.46186845  4.918919e-01    -0.1  TRUE
varsC <- setdiff(colnames(dTrainCTreated), 'y')
# all input variables should be mean 0 (up to floating-point error)
# vapply() pins the result to one numeric per column, unlike sapply().
vapply(dTrainCTreated[, varsC, drop = FALSE], mean, numeric(1))
##      x_lev_NA     x_lev_x.a        x_catB       z_clean       z_isBAD 
## -2.543809e-17  5.551115e-17  4.163336e-17 -1.942890e-16 -2.543922e-17
# all slopes should be 1: regress y on each scaled variable separately and
# pull out the slope (second coefficient, after the intercept).
# The lambda argument is named v so it does not shadow base::c.
vapply(varsC, function(v) {
  coef(lm(paste('y', v, sep = '~'), data = dTrainCTreated))[[2]]
}, numeric(1))
##  x_lev_NA x_lev_x.a    x_catB   z_clean   z_isBAD 
##         1         1         1         1         1
# Apply the same plan to the test data. Row 3 carries the level 'c', which
# was not seen in training; its x-derived columns come out at (numerically) 0.
dTestCTreated <- prepare(treatmentsC, dTestC,
                         pruneLevel = NULL, scale = TRUE)
head(dTestCTreated)
##        x_lev_NA     x_lev_x.a        x_catB       z_clean z_isBAD
## 1 -1.000000e-01 -1.666667e-01 -1.883887e-01  4.918919e-01    -0.1
## 2 -1.000000e-01  1.666667e-01  5.164882e-02  4.918919e-01    -0.1
## 3 -2.775558e-17  5.551115e-17  4.857226e-17  4.918919e-01    -0.1
## 4  5.000000e-01  1.666667e-01  4.618685e-01 -2.220446e-16     0.5
# numeric example
# Same layout as the categorical example, but the outcome y is numeric (0/1).
dTrainN <- data.frame(
  x = c(rep('a', 4), rep('b', 2), NA),
  z = c(1, 2, 3, 4, 5, NA, 7),
  y = c(0, 0, 0, 1, 0, 1, 1)
)
head(dTrainN)
##   x  z y
## 1 a  1 0
## 2 a  2 0
## 3 a  3 0
## 4 a  4 1
## 5 b  5 0
## 6 b NA 1
# Frame to be treated later: no outcome column, and x carries a level ('c')
# that never occurs in dTrainN.
dTestN <- data.frame(
  x = c('a', 'b', 'c', NA),
  z = c(10, 20, 30, NA)
)
head(dTestN)
##      x  z
## 1    a 10
## 2    b 20
## 3    c 30
## 4 <NA> NA
# Design a treatment plan on the training frame for the numeric outcome
# column 'y'. Every column name (including 'y') is passed as a candidate
# variable; the log below shows treatments being designed only for x and z.
# Assignment uses `<-`, matching every other assignment in this example.
treatmentsN <- designTreatmentsN(dTrainN, colnames(dTrainN), 'y')
## [1] "desigining treatments Mon May 25 07:50:27 2015"
## [1] "design var x Mon May 25 07:50:27 2015"
## [1] "design var z Mon May 25 07:50:27 2015"
## [1] "scoring columns Mon May 25 07:50:27 2015"
## [1] "score variable(s) x_lev_NA (derived from x ) Mon May 25 07:50:27 2015" 
## [2] "score variable(s) x_lev_x.a (derived from x ) Mon May 25 07:50:27 2015"
## [3] "score variable(s) x_lev_x.b (derived from x ) Mon May 25 07:50:27 2015"
## [1] "score variable(s) x_catN (derived from x ) Mon May 25 07:50:27 2015"
## [1] "score variable(s) z_clean (derived from z ) Mon May 25 07:50:27 2015"
## [1] "score variable(s) z_isBAD (derived from z ) Mon May 25 07:50:27 2015"
## [1] "have treatment plan Mon May 25 07:50:27 2015"
# Print the whole plan: the list of treatments, the derived variable names,
# and the per-variable vectors (varScores, PRESSRsquared, varMoves).
# Unlike the categorical plan, there is no catPseudoRSquared field.
print(treatmentsN)
## $treatments
## $treatments[[1]]
## [1] "vtreat 'Categoric Indicators'('x'->character->'x_lev_NA','x_lev_x.a','x_lev_x.b')"
## 
## $treatments[[2]]
## [1] "vtreat 'Scalable Impact Code'('x'->character->'x_catN')"
## 
## $treatments[[3]]
## [1] "vtreat 'Scalable pass through'('z'->numeric->'z_clean')"
## 
## $treatments[[4]]
## [1] "vtreat 'is.bad'('z'->numeric->'z_isBAD')"
## 
## 
## $vars
## [1] "x_lev_NA"  "x_lev_x.a" "x_lev_x.b" "x_catN"    "z_clean"   "z_isBAD"  
## 
## $varScores
##  x_lev_NA x_lev_x.a x_lev_x.b    x_catN   z_clean   z_isBAD 
## 1.3958270 1.6527526 2.2591409 1.1111135 0.9911108 1.3958270 
## 
## $PRESSRsquared
##     x_lev_NA    x_lev_x.a    x_lev_x.b       x_catN      z_clean 
## -0.395826995 -0.652752554 -1.259140904 -0.111113490  0.008889188 
##      z_isBAD 
## -0.395826995 
## 
## $varMoves
##  x_lev_NA x_lev_x.a x_lev_x.b    x_catN   z_clean   z_isBAD 
##      TRUE      TRUE      TRUE      TRUE      TRUE      TRUE 
## 
## $outcomename
## [1] "y"
## 
## $meanY
## [1] 0.4285714
## 
## $ndat
## [1] 7
## 
## attr(,"class")
## [1] "treatmentplan"
# Apply the numeric treatment plan to the training data.
# pruneLevel = NULL is the same value the original c() evaluated to, spelled
# explicitly. NOTE(review): presumably NULL disables score-based pruning --
# confirm against the prepare() documentation.
dTrainNTreated <- prepare(treatmentsN, dTrainN,
                          pruneLevel = NULL, scale = TRUE)
head(dTrainNTreated)
##     x_lev_NA  x_lev_x.a   x_lev_x.b      x_catN     z_clean    z_isBAD y
## 1 -0.0952381 -0.1785714 -0.02857143 -0.17857143 -0.41904762 -0.0952381 0
## 2 -0.0952381 -0.1785714 -0.02857143 -0.17857143 -0.26190476 -0.0952381 0
## 3 -0.0952381 -0.1785714 -0.02857143 -0.17857143 -0.10476190 -0.0952381 0
## 4 -0.0952381 -0.1785714 -0.02857143 -0.17857143  0.05238095 -0.0952381 1
## 5 -0.0952381  0.2380952  0.07142857  0.07142857  0.20952381 -0.0952381 0
## 6 -0.0952381  0.2380952  0.07142857  0.07142857  0.00000000  0.5714286 1
varsN <- setdiff(colnames(dTrainNTreated), 'y')
# all input variables should be mean 0 (up to floating-point error)
# vapply() pins the result to one numeric per column, unlike sapply().
vapply(dTrainNTreated[, varsN, drop = FALSE], mean, numeric(1))
##      x_lev_NA     x_lev_x.a     x_lev_x.b        x_catN       z_clean 
## -7.930164e-17 -5.551115e-17 -6.393743e-17 -2.379049e-17  4.757324e-17 
##       z_isBAD 
## -7.929874e-17
# all slopes should be 1: regress y on each scaled variable separately and
# pull out the slope (second coefficient, after the intercept).
# The lambda argument is named v so it does not shadow base::c.
vapply(varsN, function(v) {
  coef(lm(paste('y', v, sep = '~'), data = dTrainNTreated))[[2]]
}, numeric(1))
##  x_lev_NA x_lev_x.a x_lev_x.b    x_catN   z_clean   z_isBAD 
##         1         1         1         1         1         1
# Apply the same plan to the test data. Row 3 carries the level 'c', which
# was not seen in training; its x-derived columns come out at (numerically) 0.
dTestNTreated <- prepare(treatmentsN, dTestN,
                         pruneLevel = NULL, scale = TRUE)
head(dTestNTreated)
##        x_lev_NA     x_lev_x.a     x_lev_x.b        x_catN   z_clean
## 1 -9.523810e-02 -1.785714e-01 -2.857143e-02 -1.785714e-01 0.5238095
## 2 -9.523810e-02  2.380952e-01  7.142857e-02  7.142857e-02 0.5238095
## 3 -8.326673e-17 -5.551115e-17 -6.591949e-17 -2.610216e-17 0.5238095
## 4  5.714286e-01  2.380952e-01 -2.857143e-02  5.714286e-01 0.0000000
##      z_isBAD
## 1 -0.0952381
## 2 -0.0952381
## 3 -0.0952381
## 4  0.5714286