-
 import numpy
 import statistics
 import matplotlib
 import pandas
 
 
-
 def cross_predict_model(fitter, X, Y, plan):
     """train a model Y~X using the cross validation plan and return predictions"""
-    preds = [None]*X.shape[0]
+    preds = [None] * X.shape[0]
     for g in range(len(plan)):
         pi = plan[g]
         model = fitter.fit(X.iloc[pi["train"]], Y.iloc[pi["train"]])
         predg = model.predict(X.iloc[pi["test"]])
         for i in range(len(pi["test"])):
             preds[pi["test"][i]] = predg[i]
-    return (preds)
+    return preds
 
 
 def cross_predict_model_prob(fitter, X, Y, plan):
@@ -31,38 +29,39 @@ def cross_predict_model_prob(fitter, X, Y, plan):
         model = fitter.fit(X.iloc[pi["train"]], Y.iloc[pi["train"]])
         predg = model.predict_proba(X.iloc[pi["test"]])
         for i in range(len(pi["test"])):
-            preds[pi["test"][i],0] = predg[i,0]
-            preds[pi["test"][i],1] = predg[i,1]
-    return (preds)
-
-
+            preds[pi["test"][i], 0] = predg[i, 0]
+            preds[pi["test"][i], 1] = predg[i, 1]
+    return preds
 
 
 def mean_deviance(predictions, istrue):
     """compute per-row deviance of predictions versus istrue"""
-    mass_on_correct = [ predictions[i,1] if istrue[i] else predictions[i,0] for i in range(len(istrue)) ]
-    return (-2*sum(numpy.log(mass_on_correct))/len(istrue))
-
+    mass_on_correct = [
+        predictions[i, 1] if istrue[i] else predictions[i, 0]
+        for i in range(len(istrue))
+    ]
+    return -2 * sum(numpy.log(mass_on_correct)) / len(istrue)
 
 
 def mean_null_deviance(istrue):
     """compute per-row null deviance of istrue"""
     p = numpy.mean(istrue)
-    mass_on_correct = [ p if istrue[i] else 1-p for i in range(len(istrue)) ]
-    return (-2*sum(numpy.log(mass_on_correct))/len(istrue))
-
+    mass_on_correct = [p if istrue[i] else 1 - p for i in range(len(istrue))]
+    return -2 * sum(numpy.log(mass_on_correct)) / len(istrue)
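
(For reference, a small worked example of the two deviance helpers above; the numbers are illustrative, not from the source, and assume the functions are imported from this module.)

```python
import numpy

# predictions holds class probabilities as columns [P(False), P(True)];
# istrue marks the observed class for each row.
predictions = numpy.array([[0.2, 0.8], [0.7, 0.3], [0.4, 0.6]])
istrue = [True, False, True]

# mean_deviance = -2 * mean(log(probability assigned to the observed class))
#               = -2 * (log(0.8) + log(0.7) + log(0.6)) / 3  ~ 0.73
print(mean_deviance(predictions, istrue))

# mean_null_deviance scores the base rate p = mean(istrue) the same way (~1.27 here).
print(mean_null_deviance(istrue))
```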
 
 
 def mk_cross_plan(n, k):
     """randomly split range(n) into k disjoint groups"""
     grp = [i % k for i in range(n)]
     numpy.random.shuffle(grp)
-    plan = [ { "train" : [i for i in range(n) if grp[i] != j],
-               "test" : [i for i in range(n) if grp[i] == j] } for j in range(k) ]
-    return (plan)
-
-
-
+    plan = [
+        {
+            "train": [i for i in range(n) if grp[i] != j],
+            "test": [i for i in range(n) if grp[i] == j],
+        }
+        for j in range(k)
+    ]
+    return plan
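
(A minimal usage sketch of mk_cross_plan together with the cross-prediction and deviance helpers above; it assumes scikit-learn is available, and the data and variable names are illustrative, not from the source.)

```python
import numpy
import pandas
from sklearn.linear_model import LogisticRegression

# Illustrative data: two numeric features and a boolean outcome.
numpy.random.seed(2019)
n = 200
X = pandas.DataFrame({"x1": numpy.random.normal(size=n),
                      "x2": numpy.random.normal(size=n)})
y = X["x1"] + numpy.random.normal(size=n) > 0

# 5-fold plan: every row index lands in exactly one test group.
plan = mk_cross_plan(n, 5)

# Out-of-sample class probabilities for every row.
preds = cross_predict_model_prob(LogisticRegression(), X, y, plan)

# Cross-validated deviance versus the no-model baseline.
print(mean_deviance(preds, y))
print(mean_null_deviance(y))
```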
 
 
 # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
@@ -73,132 +72,152 @@ def plot_roc(prediction, istrue):
     matplotlib.pyplot.figure()
     lw = 2
     matplotlib.pyplot.gcf().clear()
-    matplotlib.pyplot.plot(fpr, tpr, color='darkorange',
-                           lw=lw,
-                           label='ROC curve (area = {0:0.2f})'
-                           ''.format(auc))
-    matplotlib.pyplot.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
+    matplotlib.pyplot.plot(
+        fpr,
+        tpr,
+        color="darkorange",
+        lw=lw,
+        label="ROC curve (area = {0:0.2f})" "".format(auc),
+    )
+    matplotlib.pyplot.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
     matplotlib.pyplot.xlim([0.0, 1.0])
     matplotlib.pyplot.ylim([0.0, 1.05])
-    matplotlib.pyplot.xlabel('False Positive Rate')
-    matplotlib.pyplot.ylabel('True Positive Rate')
-    matplotlib.pyplot.title('Receiver operating characteristic example')
+    matplotlib.pyplot.xlabel("False Positive Rate")
+    matplotlib.pyplot.ylabel("True Positive Rate")
+    matplotlib.pyplot.title("Receiver operating characteristic example")
     matplotlib.pyplot.legend(loc="lower right")
     matplotlib.pyplot.show()
-    return (auc)
+    return auc
+
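
(The top of plot_roc, where fpr, tpr, and auc are computed, falls outside this hunk; per the linked scikit-learn example they presumably come from sklearn.metrics.roc_curve and sklearn.metrics.auc. Continuing the sketch above, a call would look like the following.)

```python
# ROC curve of the cross-validated scores for the positive class.
auc = plot_roc(preds[:, 1], y)
```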
 
 def dual_density_plot(probs, istrue):
     """plot a dual density plot of numeric prediction probs against boolean istrue"""
     matplotlib.pyplot.gcf().clear()
-    preds_on_positive = [ probs[i] for i in range(len(probs)) if istrue[i] ]
-    preds_on_negative = [ probs[i] for i in range(len(probs)) if not istrue[i] ]
-    seaborn.kdeplot(preds_on_positive, label = "positive examples", bw = 0.01)
-    seaborn.kdeplot(preds_on_negative, label = "negative examples", bw = 0.01)
+    preds_on_positive = [probs[i] for i in range(len(probs)) if istrue[i]]
+    preds_on_negative = [probs[i] for i in range(len(probs)) if not istrue[i]]
+    seaborn.kdeplot(preds_on_positive, label="positive examples", bw=0.01)
+    seaborn.kdeplot(preds_on_negative, label="negative examples", bw=0.01)
     matplotlib.pyplot.ylabel("density of examples")
     matplotlib.pyplot.xlabel("model score")
     matplotlib.pyplot.show()
 
+
 def dual_density_plot_proba1(probs, istrue):
     """plot a dual density plot of numeric prediction probs[:,1] against boolean istrue"""
     matplotlib.pyplot.gcf().clear()
-    preds_on_positive = [ probs[i,1] for i in range(len(probs)) if istrue[i] ]
-    preds_on_negative = [ probs[i,1] for i in range(len(probs)) if not istrue[i] ]
-    seaborn.kdeplot(preds_on_positive, label = "positive examples", bw = 0.01)
-    seaborn.kdeplot(preds_on_negative, label = "negative examples", bw = 0.01)
+    preds_on_positive = [probs[i, 1] for i in range(len(probs)) if istrue[i]]
+    preds_on_negative = [probs[i, 1] for i in range(len(probs)) if not istrue[i]]
+    seaborn.kdeplot(preds_on_positive, label="positive examples", bw=0.01)
+    seaborn.kdeplot(preds_on_negative, label="negative examples", bw=0.01)
     matplotlib.pyplot.ylabel("density of examples")
     matplotlib.pyplot.xlabel("model score")
     matplotlib.pyplot.show()
 
+
 def dual_hist_plot_proba1(probs, istrue):
     """plot a dual histogram plot of numeric prediction probs[:,1] against boolean istrue"""
     matplotlib.pyplot.gcf().clear()
-    pf = pandas.DataFrame({'prob' : [ probs[i,1] for i in range(probs.shape[0])], 'istrue' : istrue})
+    pf = pandas.DataFrame(
+        {"prob": [probs[i, 1] for i in range(probs.shape[0])], "istrue": istrue}
+    )
     g = seaborn.FacetGrid(pf, row="istrue", height=4, aspect=3)
     bins = numpy.arange(0, 1.1, 0.1)
-    g = g.map(matplotlib.pyplot.hist, "prob", bins=bins)
-    #g = g.map(seaborn.distplot, "prob", bins=bins)
+    g.map(matplotlib.pyplot.hist, "prob", bins=bins)
     matplotlib.pyplot.show()
-
+
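
(Usage sketch for the score-distribution plots above, continuing the earlier example. Note these calls pass bw= to seaborn.kdeplot, which newer seaborn releases (0.11+) replace with bw_method/bw_adjust, so an older seaborn is assumed.)

```python
# Compare the distribution of P(True) between actual positives and negatives.
dual_density_plot_proba1(preds, y)   # overlaid kernel density estimates
dual_hist_plot_proba1(preds, y)      # faceted histograms, one row per class
```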
 
 def gain_curve_plot(prediction, outcome):
     """plot cumulative outcome as a function of prediction order (descending)"""
-    df = pandas.DataFrame({"prediction":prediction, "outcome":outcome})
+    df = pandas.DataFrame({"prediction": prediction, "outcome": outcome})
     df.sort_values(["prediction"], ascending=[False], inplace=True)
-    df["fraction_of_observations_by_prediction"] = [(1+i)/df.shape[0] for i in range(df.shape[0])]
+    df["fraction_of_observations_by_prediction"] = [
+        (1 + i) / df.shape[0] for i in range(df.shape[0])
+    ]
     df["cumulative_outcome"] = df["outcome"].cumsum()
-    df["cumulative_outcome_fraction"] = df["cumulative_outcome"]/numpy.max(df["cumulative_outcome"])
-    seaborn.scatterplot(x="fraction_of_observations_by_prediction",
-                        y="cumulative_outcome_fraction",
-                        data=df)
-    seaborn.lineplot(x=[0,1],
-                     y=[0,1],
-                     color="red")
+    df["cumulative_outcome_fraction"] = df["cumulative_outcome"] / numpy.max(
+        df["cumulative_outcome"]
+    )
+    seaborn.scatterplot(
+        x="fraction_of_observations_by_prediction",
+        y="cumulative_outcome_fraction",
+        data=df,
+    )
+    seaborn.lineplot(x=[0, 1], y=[0, 1], color="red")
+
 
 def lift_curve_plot(prediction, outcome):
     """plot lift as a function of prediction order (descending)"""
-    df = pandas.DataFrame({"prediction":prediction, "outcome":outcome})
+    df = pandas.DataFrame({"prediction": prediction, "outcome": outcome})
     df.sort_values(["prediction"], ascending=[False], inplace=True)
-    df["fraction_of_observations_by_prediction"] = [(1+i)/df.shape[0] for i in range(df.shape[0])]
+    df["fraction_of_observations_by_prediction"] = [
+        (1 + i) / df.shape[0] for i in range(df.shape[0])
+    ]
     df["cumulative_outcome"] = df["outcome"].cumsum()
-    df["cumulative_outcome_fraction"] = df["cumulative_outcome"]/numpy.max(df["cumulative_outcome"])
-    df["lift"] = df["cumulative_outcome_fraction"]/df["fraction_of_observations_by_prediction"]
-    seaborn.scatterplot(x="fraction_of_observations_by_prediction",
-                        y="lift",
-                        data=df)
+    df["cumulative_outcome_fraction"] = df["cumulative_outcome"] / numpy.max(
+        df["cumulative_outcome"]
+    )
+    df["lift"] = (
+        df["cumulative_outcome_fraction"] / df["fraction_of_observations_by_prediction"]
+    )
+    seaborn.scatterplot(x="fraction_of_observations_by_prediction", y="lift", data=df)
     matplotlib.pyplot.axhline(y=1, color="red")
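
(Both curves sort rows by descending prediction; lift is the cumulative fraction of outcome captured divided by the fraction of rows examined, so a lift of 2 at the 0.1 mark means the top 10% of rows hold twice the base rate of outcome. A usage sketch, continuing the example above.)

```python
import matplotlib.pyplot

# Cumulative outcome captured as we work down the sorted predictions.
gain_curve_plot(preds[:, 1], y.astype(float))
matplotlib.pyplot.show()

# Same ordering, expressed as lift over the random-selection baseline.
lift_curve_plot(preds[:, 1], y.astype(float))
matplotlib.pyplot.show()
```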
 
 
 def dual_hist_plot(probs, istrue):
     """plot a dual histogram plot of numeric prediction probs against boolean istrue"""
     matplotlib.pyplot.gcf().clear()
-    pf = pandas.DataFrame({'prob' : [ probs[i] for i in range(probs.shape[0])], 'istrue' : istrue})
+    pf = pandas.DataFrame(
+        {"prob": [probs[i] for i in range(probs.shape[0])], "istrue": istrue}
+    )
     g = seaborn.FacetGrid(pf, row="istrue", height=4, aspect=3)
     bins = numpy.arange(0, 1.1, 0.1)
-    g = g.map(matplotlib.pyplot.hist, "prob", bins=bins)
-    #g = g.map(seaborn.distplot, "prob", bins=bins)
+    g.map(matplotlib.pyplot.hist, "prob", bins=bins)
     matplotlib.pyplot.show()
 
 
 # https://stackoverflow.com/questions/5228158/cartesian-product-of-a-dictionary-of-lists
 def search_grid(inp):
     """build a cross product of all named dictionary entries"""
     gen = (dict(zip(inp.keys(), values)) for values in itertools.product(*inp.values()))
-    return ([ci for ci in gen])
+    return [ci for ci in gen]
 
 
 def grid_to_df(grid):
     """convert a search_grid list of maps to a pandas data frame"""
     n = len(grid)
-    keys = [ ki for ki in grid[1].keys() ]
-    return ( pandas.DataFrame({ ki : [ grid[i][ki] for i in range(n) ] for ki in keys }) )
+    keys = [ki for ki in grid[1].keys()]
+    return pandas.DataFrame({ki: [grid[i][ki] for i in range(n)] for ki in keys})
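
(A small usage sketch of search_grid and grid_to_df; the parameter names are illustrative.)

```python
# Cross product of named parameter lists -> list of parameter dictionaries.
grid = search_grid({"max_depth": [3, 5], "min_samples_leaf": [1, 10]})
# [{'max_depth': 3, 'min_samples_leaf': 1},
#  {'max_depth': 3, 'min_samples_leaf': 10},
#  {'max_depth': 5, 'min_samples_leaf': 1},
#  {'max_depth': 5, 'min_samples_leaf': 10}]

# Same grid as a pandas data frame, one row per parameter combination.
grid_frame = grid_to_df(grid)
```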
 
 
 def eval_fn_per_row(f, x2, df):
     """evaluate f(row-as-map, x2) for rows in df"""
-    return ([ f({ k : df.loc[i, k] for k in df.columns }, x2) for i in range(df.shape[0]) ])
+    return [f({k: df.loc[i, k] for k in df.columns}, x2) for i in range(df.shape[0])]
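
(Usage sketch: eval_fn_per_row applies f(row-as-dict, x2) to each row of a data frame.)

```python
import pandas

df = pandas.DataFrame({"a": [1, 2], "b": [10, 20]})
sums = eval_fn_per_row(lambda row, x2: row["a"] + row["b"] + x2, 100, df)
# [111, 122]
```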
 
 
-def perm_score_vars(d, istrue, model, modelvars, k = 5):
+def perm_score_vars(d, istrue, model, modelvars, k=5):
     """evaluate model~istrue on d permuting each of the modelvars and return variable importances"""
     d2 = d.copy()
     preds = model.predict_proba(d2[modelvars])
     basedev = mean_deviance(preds, istrue)
+
     def perm_score_var(victim):
         dorig = numpy.array(d2[victim].copy())
         dnew = numpy.array(d2[victim].copy())
+
         def perm_score_var_once():
             numpy.random.shuffle(dnew)
             d2[victim] = dnew
-            preds = model.predict_proba(d2[modelvars])
-            permdev = mean_deviance(preds, istrue)
-            return (permdev)
-        devs = [ perm_score_var_once() for rep in range(k) ]
-        d2[victim] = dorig
-        return (numpy.mean(devs), statistics.stdev(devs))
-    stats = [ perm_score_var(victim) for victim in modelvars ]
-    vf = pandas.DataFrame({"var" : modelvars})
-    vf["importance"] = [ di[0] - basedev for di in stats ]
-    vf["importance_dev"] = [ di[1] for di in stats ]
-    vf.sort_values(by = ["importance"], ascending = False, inplace = True)
-    return (vf)
+            predsp = model.predict_proba(d2[modelvars])
+            permdev = mean_deviance(predsp, istrue)
+            return permdev
 
+        devs = [perm_score_var_once() for rep in range(k)]
+        d2[victim] = dorig
+        return numpy.mean(devs), statistics.stdev(devs)
+
+    stats = [perm_score_var(victim) for victim in modelvars]
+    vf = pandas.DataFrame({"var": modelvars})
+    vf["importance"] = [di[0] - basedev for di in stats]
+    vf["importance_dev"] = [di[1] for di in stats]
+    vf.sort_values(by=["importance"], ascending=False, inplace=True)
+    return vf
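
(Permutation importance: each variable is shuffled k times and the increase in deviance over the unpermuted baseline is reported, along with its standard deviation. Continuing the earlier sketch; names are illustrative.)

```python
from sklearn.linear_model import LogisticRegression

# Fit on all rows, then score each variable by how much shuffling it hurts.
model = LogisticRegression().fit(X, y)
importances = perm_score_vars(X, y, model, ["x1", "x2"], k=5)
print(importances)   # columns: var, importance, importance_dev
```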