vtreat is a package that prepares arbitrary data frames into clean data frames that are ready for analysis. A clean data frame:
To achieve this a number of techniques are used. Principally:
For more details, see the vtreat article.
The main pattern is to use designTreatmentsC() or designTreatmentsN() to design a treatment plan, and then pass the returned structure to prepare() to apply the plan to data frames. The main feature of vtreat is that all data preparation is “y-aware”: it uses the relations of effective variables to the dependent or outcome variable to encode the effective variables.
The structure returned from designTreatmentsN() or designTreatmentsC() includes informational fields. The main fields are mostly vectors with names (all with the same names in the same order):
In addition, designTreatmentsC() returns one more vector
In all cases we have two upward biases on the scores
An example is:
library(vtreat)
# Small training frame for the categorical-outcome example: a string column
# and a numeric column (each with a missing value) plus a logical outcome y.
dTrainC <- data.frame(
  x = c("a", "a", "a", "b", "b", NA),
  z = c(1, 2, 3, 4, NA, 6),
  y = c(FALSE, FALSE, TRUE, FALSE, TRUE, TRUE)
)
head(dTrainC)
## x z y
## 1 a 1 FALSE
## 2 a 2 FALSE
## 3 a 3 TRUE
## 4 b 4 FALSE
## 5 b NA TRUE
## 6 <NA> 6 TRUE
# Test frame without an outcome column; it contains a level ("c") and
# missing values that do not all appear in the training data.
dTestC <- data.frame(
  x = c("a", "b", "c", NA),
  z = c(10, 20, 30, NA)
)
head(dTestC)
## x z
## 1 a 10
## 2 b 20
## 3 c 30
## 4 <NA> NA
# Design a treatment plan for the categorical outcome 'y'.
# NOTE(review): the final TRUE is presumably the outcome value treated as the
# target class, and colnames(dTrainC) also contains 'y' itself (the log below
# only designs x and z) -- confirm both against the vtreat documentation.
treatmentsC <- designTreatmentsC(dTrainC,colnames(dTrainC),'y',TRUE)
## [1] "desigining treatments Mon May 25 07:50:27 2015"
## [1] "design var x Mon May 25 07:50:27 2015"
## [1] "design var z Mon May 25 07:50:27 2015"
## [1] "scoring columns Mon May 25 07:50:27 2015"
## [1] "score variable(s) x_lev_NA (derived from x ) Mon May 25 07:50:27 2015"
## [2] "score variable(s) x_lev_x.a (derived from x ) Mon May 25 07:50:27 2015"
## [3] "score variable(s) x_lev_x.b (derived from x ) Mon May 25 07:50:27 2015"
## [1] "score variable(s) x_catB (derived from x ) Mon May 25 07:50:27 2015"
## [1] "score variable(s) z_clean (derived from z ) Mon May 25 07:50:27 2015"
## [1] "score variable(s) z_isBAD (derived from z ) Mon May 25 07:50:27 2015"
## [1] "have treatment plan Mon May 25 07:50:27 2015"
# Show the whole treatment plan: the treatments, the derived variable names,
# the per-variable score vectors and the outcome metadata.
print(treatmentsC)
## $treatments
## $treatments[[1]]
## [1] "vtreat 'Categoric Indicators'('x'->character->'x_lev_NA','x_lev_x.a','x_lev_x.b')"
##
## $treatments[[2]]
## [1] "vtreat 'Bayesian Impact Code'('x'->character->'x_catB')"
##
## $treatments[[3]]
## [1] "vtreat 'Scalable pass through'('z'->numeric->'z_clean')"
##
## $treatments[[4]]
## [1] "vtreat 'is.bad'('z'->numeric->'z_isBAD')"
##
##
## $vars
## [1] "x_lev_NA" "x_lev_x.a" "x_lev_x.b" "x_catB" "z_clean" "z_isBAD"
##
## $varScores
## x_lev_NA x_lev_x.a x_lev_x.b x_catB z_clean z_isBAD
## 1.509383 1.999938 1.000000 1.374970 1.091783 1.509383
##
## $PRESSRsquared
## x_lev_NA x_lev_x.a x_lev_x.b x_catB z_clean z_isBAD
## -0.50938261 -0.99993834 0.00000000 -0.37496957 -0.09178333 -0.50938261
##
## $varMoves
## x_lev_NA x_lev_x.a x_lev_x.b x_catB z_clean z_isBAD
## TRUE TRUE FALSE TRUE TRUE TRUE
##
## $outcomename
## [1] "y"
##
## $meanY
## [1] 0.5
##
## $ndat
## [1] 6
##
## $catPseudoRSquared
## x_lev_NA x_lev_x.a x_lev_x.b x_catB z_clean
## 3.330669e-16 -2.220446e-16 0.000000e+00 5.888770e-02 -4.748721e-02
## z_isBAD
## 3.330669e-16
##
## attr(,"class")
## [1] "treatmentplan"
# Show only the first treatment stored in the plan.
print(treatmentsC$treatments[[1]])
## [1] "vtreat 'Categoric Indicators'('x'->character->'x_lev_NA','x_lev_x.a','x_lev_x.b')"
# Apply the plan to the training data, keeping every variable (no pruning)
# and requesting scaled output columns.
dTrainCTreated <- prepare(
  treatmentsC,
  dTrainC,
  pruneLevel = NULL,
  scale = TRUE
)
head(dTrainCTreated)
## x_lev_NA x_lev_x.a x_catB z_clean z_isBAD y
## 1 -0.1 -0.1666667 -0.18838870 -3.864865e-01 -0.1 FALSE
## 2 -0.1 -0.1666667 -0.18838870 -2.108108e-01 -0.1 FALSE
## 3 -0.1 -0.1666667 -0.18838870 -3.513514e-02 -0.1 TRUE
## 4 -0.1 0.1666667 0.05164882 1.405405e-01 -0.1 FALSE
## 5 -0.1 0.1666667 0.05164882 -2.220446e-16 0.5 TRUE
## 6 0.5 0.1666667 0.46186845 4.918919e-01 -0.1 TRUE
# Every treated column other than the outcome is an input variable.
varsC <- setdiff(colnames(dTrainCTreated), "y")
# Each input variable should have mean 0 on the training data.
sapply(dTrainCTreated[, varsC, drop = FALSE], mean)
## x_lev_NA x_lev_x.a x_catB z_clean z_isBAD
## -2.543809e-17 5.551115e-17 4.163336e-17 -1.942890e-16 -2.543922e-17
# Regressing y on each input variable by itself should give a slope of 1.
sapply(varsC, function(varName) {
  singleVarFit <- lm(paste("y", varName, sep = "~"), data = dTrainCTreated)
  singleVarFit$coefficients[[2]]
})
## x_lev_NA x_lev_x.a x_catB z_clean z_isBAD
## 1 1 1 1 1
# Apply the same plan to the test data, with the same settings used for the
# training data (no pruning, scaled output).
dTestCTreated <- prepare(
  treatmentsC,
  dTestC,
  pruneLevel = NULL,
  scale = TRUE
)
head(dTestCTreated)
## x_lev_NA x_lev_x.a x_catB z_clean z_isBAD
## 1 -1.000000e-01 -1.666667e-01 -1.883887e-01 4.918919e-01 -0.1
## 2 -1.000000e-01 1.666667e-01 5.164882e-02 4.918919e-01 -0.1
## 3 -2.775558e-17 5.551115e-17 4.857226e-17 4.918919e-01 -0.1
## 4 5.000000e-01 1.666667e-01 4.618685e-01 -2.220446e-16 0.5
# Numeric-outcome example: same layout as before, but y is a number.
dTrainN <- data.frame(
  x = c("a", "a", "a", "a", "b", "b", NA),
  z = c(1, 2, 3, 4, 5, NA, 7),
  y = c(0, 0, 0, 1, 0, 1, 1)
)
head(dTrainN)
## x z y
## 1 a 1 0
## 2 a 2 0
## 3 a 3 0
## 4 a 4 1
## 5 b 5 0
## 6 b NA 1
# Test frame for the numeric example; it has no outcome column.
dTestN <- data.frame(
  x = c("a", "b", "c", NA),
  z = c(10, 20, 30, NA)
)
head(dTestN)
## x z
## 1 a 10
## 2 b 20
## 3 c 30
## 4 <NA> NA
# Design a treatment plan for the numeric outcome 'y'.
# Uses `<-` for assignment, consistent with every other assignment in this
# example.
treatmentsN <- designTreatmentsN(dTrainN, colnames(dTrainN), "y")
## [1] "desigining treatments Mon May 25 07:50:27 2015"
## [1] "design var x Mon May 25 07:50:27 2015"
## [1] "design var z Mon May 25 07:50:27 2015"
## [1] "scoring columns Mon May 25 07:50:27 2015"
## [1] "score variable(s) x_lev_NA (derived from x ) Mon May 25 07:50:27 2015"
## [2] "score variable(s) x_lev_x.a (derived from x ) Mon May 25 07:50:27 2015"
## [3] "score variable(s) x_lev_x.b (derived from x ) Mon May 25 07:50:27 2015"
## [1] "score variable(s) x_catN (derived from x ) Mon May 25 07:50:27 2015"
## [1] "score variable(s) z_clean (derived from z ) Mon May 25 07:50:27 2015"
## [1] "score variable(s) z_isBAD (derived from z ) Mon May 25 07:50:27 2015"
## [1] "have treatment plan Mon May 25 07:50:27 2015"
# Show the whole numeric-outcome treatment plan: the treatments, the derived
# variable names, the per-variable score vectors and the outcome metadata.
print(treatmentsN)
## $treatments
## $treatments[[1]]
## [1] "vtreat 'Categoric Indicators'('x'->character->'x_lev_NA','x_lev_x.a','x_lev_x.b')"
##
## $treatments[[2]]
## [1] "vtreat 'Scalable Impact Code'('x'->character->'x_catN')"
##
## $treatments[[3]]
## [1] "vtreat 'Scalable pass through'('z'->numeric->'z_clean')"
##
## $treatments[[4]]
## [1] "vtreat 'is.bad'('z'->numeric->'z_isBAD')"
##
##
## $vars
## [1] "x_lev_NA" "x_lev_x.a" "x_lev_x.b" "x_catN" "z_clean" "z_isBAD"
##
## $varScores
## x_lev_NA x_lev_x.a x_lev_x.b x_catN z_clean z_isBAD
## 1.3958270 1.6527526 2.2591409 1.1111135 0.9911108 1.3958270
##
## $PRESSRsquared
## x_lev_NA x_lev_x.a x_lev_x.b x_catN z_clean
## -0.395826995 -0.652752554 -1.259140904 -0.111113490 0.008889188
## z_isBAD
## -0.395826995
##
## $varMoves
## x_lev_NA x_lev_x.a x_lev_x.b x_catN z_clean z_isBAD
## TRUE TRUE TRUE TRUE TRUE TRUE
##
## $outcomename
## [1] "y"
##
## $meanY
## [1] 0.4285714
##
## $ndat
## [1] 7
##
## attr(,"class")
## [1] "treatmentplan"
# Apply the numeric plan to the training data, keeping every variable
# (no pruning) and requesting scaled output columns.
dTrainNTreated <- prepare(
  treatmentsN,
  dTrainN,
  pruneLevel = NULL,
  scale = TRUE
)
head(dTrainNTreated)
## x_lev_NA x_lev_x.a x_lev_x.b x_catN z_clean z_isBAD y
## 1 -0.0952381 -0.1785714 -0.02857143 -0.17857143 -0.41904762 -0.0952381 0
## 2 -0.0952381 -0.1785714 -0.02857143 -0.17857143 -0.26190476 -0.0952381 0
## 3 -0.0952381 -0.1785714 -0.02857143 -0.17857143 -0.10476190 -0.0952381 0
## 4 -0.0952381 -0.1785714 -0.02857143 -0.17857143 0.05238095 -0.0952381 1
## 5 -0.0952381 0.2380952 0.07142857 0.07142857 0.20952381 -0.0952381 0
## 6 -0.0952381 0.2380952 0.07142857 0.07142857 0.00000000 0.5714286 1
# Every treated column other than the outcome is an input variable.
varsN <- setdiff(colnames(dTrainNTreated), "y")
# Each input variable should have mean 0 on the training data.
sapply(dTrainNTreated[, varsN, drop = FALSE], mean)
## x_lev_NA x_lev_x.a x_lev_x.b x_catN z_clean
## -7.930164e-17 -5.551115e-17 -6.393743e-17 -2.379049e-17 4.757324e-17
## z_isBAD
## -7.929874e-17
# Regressing y on each input variable by itself should give a slope of 1.
sapply(varsN, function(varName) {
  singleVarFit <- lm(paste("y", varName, sep = "~"), data = dTrainNTreated)
  singleVarFit$coefficients[[2]]
})
## x_lev_NA x_lev_x.a x_lev_x.b x_catN z_clean z_isBAD
## 1 1 1 1 1 1
# Apply the numeric plan to the test data, with the same settings used for
# the training data (no pruning, scaled output).
dTestNTreated <- prepare(
  treatmentsN,
  dTestN,
  pruneLevel = NULL,
  scale = TRUE
)
head(dTestNTreated)
## x_lev_NA x_lev_x.a x_lev_x.b x_catN z_clean
## 1 -9.523810e-02 -1.785714e-01 -2.857143e-02 -1.785714e-01 0.5238095
## 2 -9.523810e-02 2.380952e-01 7.142857e-02 7.142857e-02 0.5238095
## 3 -8.326673e-17 -5.551115e-17 -6.591949e-17 -2.610216e-17 0.5238095
## 4 5.714286e-01 2.380952e-01 -2.857143e-02 5.714286e-01 0.0000000
## z_isBAD
## 1 -0.0952381
## 2 -0.0952381
## 3 -0.0952381
## 4 0.5714286