@@ -28,6 +28,12 @@ library(rqdatatable)
2828
2929``` r
3030library(vtreat )
31+ packageVersion(' vtreat' )
32+ ```
33+
34+ ## [1] '1.5.1'
35+
36+ ``` r
3137suppressPackageStartupMessages(library(ggplot2 ))
3238library(WVPlots )
3339```
@@ -44,6 +50,8 @@ Generate example data.
4450<!-- end list -->
4551
4652``` r
53+ set.seed(2020 )
54+
4755make_data <- function (nrows ) {
4856 d <- data.frame (x = 5 * rnorm(nrows ))
4957 d [' y' ] = sin(d [' x' ]) + 0.1 * rnorm(n = nrows )
@@ -62,14 +70,14 @@ d %.>%
6270 knitr :: kable(. )
6371```
6472
65- | x | y | xc | x2 | yc |
66- | ----------: | ----------: | :---------- | ----------: | :---- |
67- | \- 0.1147391 | \- 0.0043260 | level\_ 0 | 0.9387403 | FALSE |
68- | \- 5.8194213 | 0.4713108 | level\_ 0.5 | \- 0.9525914 | FALSE |
69- | \- 8.4565114 | \- 0.7274771 | level\_ -0.5 | \- 0.7777899 | FALSE |
70- | NA | 0.3300295 | level\_ 0.5 | 0.0947080 | FALSE |
71- | NA | 0.1465502 | level \_ 0 | \- 0.1983596 | FALSE |
72- | NA | 0.6885266 | level\_ 0 .5 | \- 0.3045011 | TRUE |
73+ | x | y | xc | x2 | yc |
74+ | ---------: | ----------: | :---------- | ----------: | :---- |
75+ | 1.884861 | 1.0717646 | level\_ 1 | 0.0046504 | TRUE |
76+ | 1.507742 | 0.9958029 | level\_ 1 | \- 1.2287497 | TRUE |
77+ | \- 5.490116 | 0.8315705 | level\_ 1 | \- 0.1405980 | TRUE |
78+ | NA | 0.6007655 | level\_ 0.5 | \- 0.2073270 | TRUE |
79+ | NA | \- 0.8339836 | NA | \- 0.9215306 | FALSE |
80+ | NA | \- 0.5329006 | level\_ -0 .5 | 0.3604742 | FALSE |
7381
7482### Some quick data exploration
7583
@@ -80,28 +88,28 @@ unique(d['xc'])
8088```
8189
8290 ## xc
83- ## 1 level_0
84- ## 2 level_0.5
85- ## 3 level_-0.5
86- ## 8 level_1
87- ## 9 <NA>
88- ## 276 level_-1.5
91+ ## 1 level_1
92+ ## 4 level_0.5
93+ ## 5 <NA>
94+ ## 6 level_-0.5
95+ ## 27 level_0
96+ ## 269 level_-1.5
8997
9098``` r
9199table(d $ xc , useNA = ' always' )
92100```
93101
94102 ##
95103 ## level_-0.5 level_-1.5 level_0 level_0.5 level_1 <NA>
96- ## 94 1 79 106 113 107
104+ ## 94 1 85 98 103 119
97105
98106Find the mean value of `yc`
99107
100108``` r
101109mean(d [[' yc' ]])
102110```
103111
104- ## [1] 0.318
112+ ## [1] 0.324
105113
106114Plot of `yc` versus `x`.
107115
@@ -182,16 +190,16 @@ knitr::kable(score_frame)
182190
183191| varName | varMoves | rsq | sig | needsSplit | extraModelDegrees | origName | code | recommended |
184192| :----------------------------- | :------- | --------: | --------: | :--------- | ----------------: | :------- | :---- | :---------- |
185- | x | TRUE | 0.0004439 | 0.5983019 | FALSE | 0 | x | clean | FALSE |
186- | x\_ isBAD | TRUE | 0.0000556 | 0.8520676 | FALSE | 0 | x | isBAD | FALSE |
187- | xc\_ catP | TRUE | 0.2293441 | 0.0000000 | TRUE | 5 | xc | catP | TRUE |
188- | xc\_ catB | TRUE | 0.7647009 | 0.0000000 | TRUE | 5 | xc | catB | TRUE |
189- | x2 | TRUE | 0.0068394 | 0.0386304 | FALSE | 0 | x2 | clean | TRUE |
190- | xc\_ lev\_ NA | TRUE | 0.1518186 | 0.0000000 | FALSE | 0 | xc | lev | TRUE |
191- | xc\_ lev\_ x\_ level\_ minus\_ 0\_ 5 | TRUE | 0.1307155 | 0.0000000 | FALSE | 0 | xc | lev | TRUE |
192- | xc\_ lev\_ x\_ level\_ 0 | TRUE | 0.1074272 | 0.0000000 | FALSE | 0 | xc | lev | TRUE |
193- | xc\_ lev\_ x\_ level\_ 0\_ 5 | TRUE | 0.0128574 | 0.0045745 | FALSE | 0 | xc | lev | TRUE |
194- | xc\_ lev\_ x\_ level\_ 1 | TRUE | 0.5486677 | 0.0000000 | FALSE | 0 | xc | lev | TRUE |
193+ | x | TRUE | 0.0005756 | 0.5470919 | FALSE | 0 | x | clean | FALSE |
194+ | x\_ isBAD | TRUE | 0.0000771 | 0.8255885 | FALSE | 0 | x | isBAD | FALSE |
195+ | xc\_ catP | TRUE | 0.0008468 | 0.4652101 | TRUE | 5 | xc | catP | FALSE |
196+ | xc\_ catB | TRUE | 0.7883578 | 0.0000000 | TRUE | 5 | xc | catB | TRUE |
197+ | x2 | TRUE | 0.0026075 | 0.2000083 | FALSE | 0 | x2 | clean | FALSE |
198+ | xc\_ lev\_ NA | TRUE | 0.1750095 | 0.0000000 | FALSE | 0 | xc | lev | TRUE |
199+ | xc\_ lev\_ x\_ level\_ minus\_ 0\_ 5 | TRUE | 0.1328708 | 0.0000000 | FALSE | 0 | xc | lev | TRUE |
200+ | xc\_ lev\_ x\_ level\_ 0 | TRUE | 0.1185254 | 0.0000000 | FALSE | 0 | xc | lev | TRUE |
201+ | xc\_ lev\_ x\_ level\_ 0\_ 5 | TRUE | 0.0644178 | 0.0000000 | FALSE | 0 | xc | lev | TRUE |
202+ | xc\_ lev\_ x\_ level\_ 1 | TRUE | 0.4701626 | 0.0000000 | FALSE | 0 | xc | lev | TRUE |
195203
196204Note that the variable `xc` has been converted to multiple variables:
197205
@@ -227,9 +235,7 @@ score_frame[score_frame[['recommended']], 'varName', drop = FALSE] %.>%
227235
228236| | varName |
229237| -- | :----------------------------- |
230- | 3 | xc\_ catP |
231238| 4 | xc\_ catB |
232- | 5 | x2 |
233239| 6 | xc\_ lev\_ NA |
234240| 7 | xc\_ lev\_ x\_ level\_ minus\_ 0\_ 5 |
235241| 8 | xc\_ lev\_ x\_ level\_ 0 |
@@ -242,10 +248,12 @@ score_frame[!score_frame[['recommended']], 'varName', drop = FALSE] %.>%
242248 knitr :: kable(. )
243249```
244250
245- | varName |
246- | :------- |
247- | x |
248- | x\_ isBAD |
251+ | | varName |
252+ | - | :------- |
253+ | 1 | x |
254+ | 2 | x\_ isBAD |
255+ | 3 | xc\_ catP |
256+ | 5 | x2 |
249257
250258Notice that `d_prepared` only includes derived variables and the outcome
251259`y`:
@@ -256,14 +264,14 @@ d_prepared %.>%
256264 knitr :: kable(. )
257265```
258266
259- | x | x\_ isBAD | xc\_ catP | xc\_ catB | x2 | xc\_ lev\_ NA | xc\_ lev\_ x\_ level\_ minus\_ 0\_ 5 | xc\_ lev\_ x\_ level\_ 0 | xc\_ lev\_ x\_ level\_ 0\_ 5 | xc\_ lev\_ x\_ level\_ 1 | yc |
260- | ----------: | -------: | --------: | -----------: | ----------: | ----------: | -----------------------------: | -------------------: | ----------------------: | -------------------: | :---- |
261- | \- 0.1147391 | 0 | 0.1586826 | \- 12.4285135 | 0.9387403 | 0 | 0 | 1 | 0 | 0 | FALSE |
262- | \- 5.8194213 | 0 | 0.2005988 | 0.4207646 | \- 0.9525914 | 0 | 0 | 0 | 1 | 0 | FALSE |
263- | \- 8.4565114 | 0 | 0.1766467 | \- 12.5357588 | \- 0.7777899 | 0 | 1 | 0 | 0 | 0 | FALSE |
264- | \- 0.4097425 | 1 | 0.2342342 | 0.5695332 | 0.0947080 | 0 | 0 | 0 | 1 | 0 | FALSE |
265- | \- 0.4097425 | 1 | 0.1501502 | \- 12.3469806 | \- 0.1983596 | 0 | 0 | 1 | 0 | 0 | FALSE |
266- | \- 0.1622745 | 1 | 0.2012012 | 0.4912209 | \- 0.3045011 | 0 | 0 | 0 | 1 | 0 | TRUE |
267+ | x | x\_ isBAD | xc\_ catP | xc\_ catB | x2 | xc\_ lev\_ NA | xc\_ lev\_ x\_ level\_ minus\_ 0\_ 5 | xc\_ lev\_ x\_ level\_ 0 | xc\_ lev\_ x\_ level\_ 0\_ 5 | xc\_ lev\_ x\_ level\_ 1 | yc |
268+ | ----------: | -------: | --------: | ----------: | ----------: | ----------: | -----------------------------: | -------------------: | ----------------------: | -------------------: | :---- |
269+ | 1.8848606 | 0 | 0.2102102 | 14.206543 | 0.0046504 | 0 | 0 | 0 | 0 | 1 | TRUE |
270+ | 1.5077419 | 0 | 0.2005988 | 14.139786 | \- 1.2287497 | 0 | 0 | 0 | 0 | 1 | TRUE |
271+ | \- 5.4901159 | 0 | 0.2005988 | 14.139786 | \- 0.1405980 | 0 | 0 | 0 | 0 | 1 | TRUE |
272+ | \- 0.1276897 | 1 | 0.1891892 | 1.219475 | \- 0.2073270 | 0 | 0 | 0 | 1 | 0 | TRUE |
273+ | \- 0.3929879 | 1 | 0.2402402 | \- 12.844663 | \- 0.9215306 | 1 | 0 | 0 | 0 | 0 | FALSE |
274+ | \- 0.2908461 | 1 | 0.1766467 | \- 12.563128 | 0.3604742 | 0 | 1 | 0 | 0 | 0 | FALSE |
267275
268276## A Closer Look at ` catB ` variables
269277
@@ -520,9 +528,9 @@ d %.>%
520528
521529| rsq | count | sig | var |
522530| -----------: | ----: | --------: | :-- |
523- | 0.0004438531 | 2 | 1.0000000 | x |
524- | 0.0068394173 | 3 | 0.1158911 | x2 |
525- | 0.7638059755 | 2 | 0.0000000 | xc |
531+ | 0.0005756197 | 2 | 1.0000000 | x |
532+ | 0.0026074775 | 3 | 0.6000248 | x2 |
533+ | 0.7883476986 | 2 | 0.0000000 | xc |
526534
527535More on non-linear variable scoring can be found
528536[here](https://cran.r-project.org/web/packages/vtreat/vignettes/VariableImportance.html).
0 commit comments