Skip to content

Commit

Permalink
get KDD example working in database
Browse files Browse the repository at this point in the history
  • Loading branch information
JohnMount committed Jan 10, 2022
1 parent 8dbbfe0 commit d1ac0a1
Show file tree
Hide file tree
Showing 10 changed files with 517 additions and 161 deletions.
283 changes: 207 additions & 76 deletions Examples/KDD2009Example/KDD2009Example.ipynb

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions coverage.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,10 @@ pkg/vtreat/stats_utils.py 83 3 96%
pkg/vtreat/transform.py 17 4 76%
pkg/vtreat/util.py 152 21 86%
pkg/vtreat/vtreat_api.py 286 35 88%
pkg/vtreat/vtreat_db_adapter.py 65 0 100%
pkg/vtreat/vtreat_impl.py 710 78 89%
pkg/vtreat/vtreat_db_adapter.py 69 0 100%
pkg/vtreat/vtreat_impl.py 717 78 89%
-----------------------------------------------------
TOTAL 1372 152 89%
TOTAL 1383 152 89%


============================= 33 passed in 21.92s ==============================
============================= 33 passed in 22.18s ==============================
2 changes: 1 addition & 1 deletion docs/search.js

Large diffs are not rendered by default.

34 changes: 26 additions & 8 deletions docs/vtreat/vtreat_db_adapter.html
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,14 @@ <h1 class="modulename">
<span class="n">vtreat_descr</span><span class="p">[</span><span class="s2">&quot;value&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">replace_bad_with_sentinel</span><span class="p">(</span><span class="n">vtreat_descr</span><span class="p">[</span><span class="s2">&quot;value&quot;</span><span class="p">])</span>
<span class="c1"># check our expected invariants</span>
<span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">vtreat_descr</span><span class="p">,</span> <span class="n">pandas</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span>
<span class="c1"># numeric is a function of original variable only</span>
<span class="n">check_fn_relnn</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">data</span><span class="p">(</span><span class="n">vtreat_descr</span><span class="o">=</span><span class="n">vtreat_descr</span><span class="p">)</span>
<span class="o">.</span><span class="n">project</span><span class="p">({},</span> <span class="n">group_by</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;orig_var&quot;</span><span class="p">,</span> <span class="s2">&quot;orig_was_numeric&quot;</span><span class="p">])</span>
<span class="o">.</span><span class="n">extend</span><span class="p">({</span><span class="s2">&quot;one&quot;</span><span class="p">:</span> <span class="mi">1</span><span class="p">})</span>
<span class="o">.</span><span class="n">project</span><span class="p">({</span><span class="s2">&quot;count&quot;</span><span class="p">:</span> <span class="s2">&quot;one.sum()&quot;</span><span class="p">},</span> <span class="n">group_by</span><span class="o">=</span><span class="p">[</span><span class="s2">&quot;orig_var&quot;</span><span class="p">])</span>
<span class="p">)</span><span class="o">.</span><span class="n">ex</span><span class="p">()</span>
<span class="k">assert</span> <span class="n">numpy</span><span class="o">.</span><span class="n">all</span><span class="p">(</span><span class="n">check_fn_relnn</span><span class="p">[</span><span class="s2">&quot;count&quot;</span><span class="p">]</span> <span class="o">==</span> <span class="mi">1</span><span class="p">)</span>
<span class="c1"># variable consumed is function of variable produced and treatment only</span>
<span class="n">check_fn_reln2</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">data</span><span class="p">(</span><span class="n">vtreat_descr</span><span class="o">=</span><span class="n">vtreat_descr</span><span class="p">)</span>
Expand Down Expand Up @@ -157,10 +165,15 @@ <h1 class="modulename">
<span class="n">vtreat_descr</span><span class="p">[</span><span class="s2">&quot;treatment_class&quot;</span><span class="p">]</span> <span class="o">==</span> <span class="s2">&quot;IndicateMissingTransform&quot;</span><span class="p">,</span> <span class="p">:</span>
<span class="p">]</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">inplace</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">im_rows</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">]):</span>
<span class="n">step_1_ops</span><span class="p">[</span>
<span class="n">im_rows</span><span class="p">[</span><span class="s2">&quot;variable&quot;</span><span class="p">][</span><span class="n">i</span><span class="p">]</span>
<span class="p">]</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="n">im_rows</span><span class="p">[</span><span class="s1">&#39;orig_var&#39;</span><span class="p">][</span><span class="n">i</span><span class="p">]</span><span class="si">}</span><span class="s2">.is_bad().if_else(1.0, 0.0)&quot;</span>
<span class="c1"># add in general value indicators or dummies</span>
<span class="k">if</span> <span class="n">im_rows</span><span class="p">[</span><span class="s1">&#39;orig_was_numeric&#39;</span><span class="p">][</span><span class="n">i</span><span class="p">]:</span>
<span class="n">step_1_ops</span><span class="p">[</span>
<span class="n">im_rows</span><span class="p">[</span><span class="s2">&quot;variable&quot;</span><span class="p">][</span><span class="n">i</span><span class="p">]</span>
<span class="p">]</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="n">im_rows</span><span class="p">[</span><span class="s1">&#39;orig_var&#39;</span><span class="p">][</span><span class="n">i</span><span class="p">]</span><span class="si">}</span><span class="s2">.is_bad().if_else(1.0, 0.0)&quot;</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">step_1_ops</span><span class="p">[</span>
<span class="n">im_rows</span><span class="p">[</span><span class="s2">&quot;variable&quot;</span><span class="p">][</span><span class="n">i</span><span class="p">]</span>
<span class="p">]</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">&quot;(</span><span class="si">{</span><span class="n">im_rows</span><span class="p">[</span><span class="s1">&#39;orig_var&#39;</span><span class="p">][</span><span class="n">i</span><span class="p">]</span><span class="si">}</span><span class="s2">.coalesce(&#39;</span><span class="si">{</span><span class="n">bad_sentinel</span><span class="si">}</span><span class="s2">&#39;) == &#39;</span><span class="si">{</span><span class="n">bad_sentinel</span><span class="si">}</span><span class="s2">&#39;).if_else(1.0, 0.0)&quot;</span>
<span class="c1"># add in general value indicators or dummies, all indicators are non-numeric (string)</span>
<span class="n">ic_rows</span> <span class="o">=</span> <span class="n">vtreat_descr</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span>
<span class="n">vtreat_descr</span><span class="p">[</span><span class="s2">&quot;treatment_class&quot;</span><span class="p">]</span> <span class="o">==</span> <span class="s2">&quot;IndicatorCodeTransform&quot;</span><span class="p">,</span> <span class="p">:</span>
<span class="p">]</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">inplace</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
Expand Down Expand Up @@ -288,10 +301,15 @@ <h1 class="modulename">
<span class="n">vtreat_descr</span><span class="p">[</span><span class="s2">&quot;treatment_class&quot;</span><span class="p">]</span> <span class="o">==</span> <span class="s2">&quot;IndicateMissingTransform&quot;</span><span class="p">,</span> <span class="p">:</span>
<span class="p">]</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">inplace</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">im_rows</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">]):</span>
<span class="n">step_1_ops</span><span class="p">[</span>
<span class="n">im_rows</span><span class="p">[</span><span class="s2">&quot;variable&quot;</span><span class="p">][</span><span class="n">i</span><span class="p">]</span>
<span class="p">]</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="n">im_rows</span><span class="p">[</span><span class="s1">&#39;orig_var&#39;</span><span class="p">][</span><span class="n">i</span><span class="p">]</span><span class="si">}</span><span class="s2">.is_bad().if_else(1.0, 0.0)&quot;</span>
<span class="c1"># add in general value indicators or dummies</span>
<span class="k">if</span> <span class="n">im_rows</span><span class="p">[</span><span class="s1">&#39;orig_was_numeric&#39;</span><span class="p">][</span><span class="n">i</span><span class="p">]:</span>
<span class="n">step_1_ops</span><span class="p">[</span>
<span class="n">im_rows</span><span class="p">[</span><span class="s2">&quot;variable&quot;</span><span class="p">][</span><span class="n">i</span><span class="p">]</span>
<span class="p">]</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="n">im_rows</span><span class="p">[</span><span class="s1">&#39;orig_var&#39;</span><span class="p">][</span><span class="n">i</span><span class="p">]</span><span class="si">}</span><span class="s2">.is_bad().if_else(1.0, 0.0)&quot;</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">step_1_ops</span><span class="p">[</span>
<span class="n">im_rows</span><span class="p">[</span><span class="s2">&quot;variable&quot;</span><span class="p">][</span><span class="n">i</span><span class="p">]</span>
<span class="p">]</span> <span class="o">=</span> <span class="sa">f</span><span class="s2">&quot;(</span><span class="si">{</span><span class="n">im_rows</span><span class="p">[</span><span class="s1">&#39;orig_var&#39;</span><span class="p">][</span><span class="n">i</span><span class="p">]</span><span class="si">}</span><span class="s2">.coalesce(&#39;</span><span class="si">{</span><span class="n">bad_sentinel</span><span class="si">}</span><span class="s2">&#39;) == &#39;</span><span class="si">{</span><span class="n">bad_sentinel</span><span class="si">}</span><span class="s2">&#39;).if_else(1.0, 0.0)&quot;</span>
<span class="c1"># add in general value indicators or dummies, all indicators are non-numeric (string)</span>
<span class="n">ic_rows</span> <span class="o">=</span> <span class="n">vtreat_descr</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span>
<span class="n">vtreat_descr</span><span class="p">[</span><span class="s2">&quot;treatment_class&quot;</span><span class="p">]</span> <span class="o">==</span> <span class="s2">&quot;IndicatorCodeTransform&quot;</span><span class="p">,</span> <span class="p">:</span>
<span class="p">]</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">inplace</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
Expand Down
Loading

0 comments on commit d1ac0a1

Please sign in to comment.