
Commit a03417e

reformat and clean up project

1 parent 3251b5f commit a03417e

File tree

11 files changed: +238 -156 lines changed


pkg/.idea/.gitignore

Lines changed: 2 additions & 0 deletions

pkg/.idea/encodings.xml

Lines changed: 6 additions & 0 deletions

pkg/.idea/inspectionProfiles/profiles_settings.xml

Lines changed: 6 additions & 0 deletions

pkg/.idea/misc.xml

Lines changed: 4 additions & 0 deletions

pkg/.idea/modules.xml

Lines changed: 8 additions & 0 deletions

pkg/.idea/pkg.iml

Lines changed: 12 additions & 0 deletions

pkg/.idea/vcs.xml

Lines changed: 6 additions & 0 deletions

pkg/build/lib/wvpy/util.py

Lines changed: 97 additions & 78 deletions
@@ -1,4 +1,3 @@
-
 import numpy
 import statistics
 import matplotlib
@@ -10,17 +9,16 @@
 import pandas
 
 
-
 def cross_predict_model(fitter, X, Y, plan):
     """train a model Y~X using the cross validation plan and return predictions"""
-    preds = [None]*X.shape[0]
+    preds = [None] * X.shape[0]
     for g in range(len(plan)):
         pi = plan[g]
         model = fitter.fit(X.iloc[pi["train"]], Y.iloc[pi["train"]])
         predg = model.predict(X.iloc[pi["test"]])
         for i in range(len(pi["test"])):
             preds[pi["test"][i]] = predg[i]
-    return(preds)
+    return preds
 
 
 def cross_predict_model_prob(fitter, X, Y, plan):
@@ -31,38 +29,39 @@ def cross_predict_model_prob(fitter, X, Y, plan):
         model = fitter.fit(X.iloc[pi["train"]], Y.iloc[pi["train"]])
         predg = model.predict_proba(X.iloc[pi["test"]])
         for i in range(len(pi["test"])):
-            preds[pi["test"][i],0] = predg[i,0]
-            preds[pi["test"][i],1] = predg[i,1]
-    return(preds)
-
-
+            preds[pi["test"][i], 0] = predg[i, 0]
+            preds[pi["test"][i], 1] = predg[i, 1]
+    return preds
 
 
 def mean_deviance(predictions, istrue):
     """compute per-row deviance of predictions versus istrue"""
-    mass_on_correct = [ predictions[i,1] if istrue[i] else predictions[i,0] for i in range(len(istrue)) ]
-    return(-2*sum(numpy.log(mass_on_correct))/len(istrue))
-
+    mass_on_correct = [
+        predictions[i, 1] if istrue[i] else predictions[i, 0]
+        for i in range(len(istrue))
+    ]
+    return -2 * sum(numpy.log(mass_on_correct)) / len(istrue)
 
 
 def mean_null_deviance(istrue):
     """compute per-row nulll deviance of predictions versus istrue"""
     p = numpy.mean(istrue)
-    mass_on_correct = [ p if istrue[i] else 1-p for i in range(len(istrue)) ]
-    return(-2*sum(numpy.log(mass_on_correct))/len(istrue))
-
+    mass_on_correct = [p if istrue[i] else 1 - p for i in range(len(istrue))]
+    return -2 * sum(numpy.log(mass_on_correct)) / len(istrue)
 
 
 def mk_cross_plan(n, k):
     """randomly split range(n) into k disjoint groups"""
     grp = [i % k for i in range(n)]
     numpy.random.shuffle(grp)
-    plan = [ { "train" : [i for i in range(n) if grp[i] != j],
-               "test" : [i for i in range(n) if grp[i] == j] } for j in range(k) ]
-    return(plan)
-
-
-
+    plan = [
+        {
+            "train": [i for i in range(n) if grp[i] != j],
+            "test": [i for i in range(n) if grp[i] == j],
+        }
+        for j in range(k)
+    ]
+    return plan
 
 
 # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
@@ -73,132 +72,152 @@ def plot_roc(prediction, istrue):
     matplotlib.pyplot.figure()
     lw = 2
     matplotlib.pyplot.gcf().clear()
-    matplotlib.pyplot.plot(fpr, tpr, color='darkorange',
-                           lw=lw,
-                           label='ROC curve (area = {0:0.2f})'
-                           ''.format(auc))
-    matplotlib.pyplot.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
+    matplotlib.pyplot.plot(
+        fpr,
+        tpr,
+        color="darkorange",
+        lw=lw,
+        label="ROC curve (area = {0:0.2f})" "".format(auc),
+    )
+    matplotlib.pyplot.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
     matplotlib.pyplot.xlim([0.0, 1.0])
     matplotlib.pyplot.ylim([0.0, 1.05])
-    matplotlib.pyplot.xlabel('False Positive Rate')
-    matplotlib.pyplot.ylabel('True Positive Rate')
-    matplotlib.pyplot.title('Receiver operating characteristic example')
+    matplotlib.pyplot.xlabel("False Positive Rate")
+    matplotlib.pyplot.ylabel("True Positive Rate")
+    matplotlib.pyplot.title("Receiver operating characteristic example")
     matplotlib.pyplot.legend(loc="lower right")
     matplotlib.pyplot.show()
-    return(auc)
+    return auc
+
 
 def dual_density_plot(probs, istrue):
     """plot a dual density plot of numeric prediction probs against boolean istrue"""
     matplotlib.pyplot.gcf().clear()
-    preds_on_positive = [ probs[i] for i in range(len(probs)) if istrue[i] ]
-    preds_on_negative = [ probs[i] for i in range(len(probs)) if not istrue[i] ]
-    seaborn.kdeplot(preds_on_positive, label = "positive examples", bw=0.01)
-    seaborn.kdeplot(preds_on_negative , label = "negative examples", bw=0.01)
+    preds_on_positive = [probs[i] for i in range(len(probs)) if istrue[i]]
+    preds_on_negative = [probs[i] for i in range(len(probs)) if not istrue[i]]
+    seaborn.kdeplot(preds_on_positive, label="positive examples", bw=0.01)
+    seaborn.kdeplot(preds_on_negative, label="negative examples", bw=0.01)
     matplotlib.pyplot.ylabel("density of examples")
     matplotlib.pyplot.xlabel("model score")
     matplotlib.pyplot.show()
 
+
 def dual_density_plot_proba1(probs, istrue):
     """plot a dual density plot of numeric prediction probs[:,1] against boolean istrue"""
     matplotlib.pyplot.gcf().clear()
-    preds_on_positive = [ probs[i,1] for i in range(len(probs)) if istrue[i] ]
-    preds_on_negative = [ probs[i,1] for i in range(len(probs)) if not istrue[i] ]
-    seaborn.kdeplot(preds_on_positive, label = "positive examples", bw=0.01)
-    seaborn.kdeplot(preds_on_negative , label = "negative examples", bw=0.01)
+    preds_on_positive = [probs[i, 1] for i in range(len(probs)) if istrue[i]]
+    preds_on_negative = [probs[i, 1] for i in range(len(probs)) if not istrue[i]]
+    seaborn.kdeplot(preds_on_positive, label="positive examples", bw=0.01)
+    seaborn.kdeplot(preds_on_negative, label="negative examples", bw=0.01)
     matplotlib.pyplot.ylabel("density of examples")
     matplotlib.pyplot.xlabel("model score")
     matplotlib.pyplot.show()
 
+
 def dual_hist_plot_proba1(probs, istrue):
     """plot a dual histogram plot of numeric prediction probs[:,1] against boolean istrue"""
     matplotlib.pyplot.gcf().clear()
-    pf = pandas.DataFrame({'prob' : [ probs[i,1] for i in range(probs.shape[0])], 'istrue' : istrue})
+    pf = pandas.DataFrame(
+        {"prob": [probs[i, 1] for i in range(probs.shape[0])], "istrue": istrue}
+    )
     g = seaborn.FacetGrid(pf, row="istrue", height=4, aspect=3)
     bins = numpy.arange(0, 1.1, 0.1)
-    g = g.map(matplotlib.pyplot.hist, "prob", bins=bins)
-    #g = g.map(seaborn.distplot, "prob", bins=bins)
+    g.map(matplotlib.pyplot.hist, "prob", bins=bins)
     matplotlib.pyplot.show()
-
+
 
 def gain_curve_plot(prediction, outcome):
     """plot cumulative outcome as a function of prediction order (descending)"""
-    df = pandas.DataFrame({"prediction":prediction, "outcome":outcome})
+    df = pandas.DataFrame({"prediction": prediction, "outcome": outcome})
     df.sort_values(["prediction"], ascending=[False], inplace=True)
-    df["fraction_of_observations_by_prediction"] = [(1+i)/df.shape[0] for i in range(df.shape[0])]
+    df["fraction_of_observations_by_prediction"] = [
+        (1 + i) / df.shape[0] for i in range(df.shape[0])
+    ]
     df["cumulative_outcome"] = df["outcome"].cumsum()
-    df["cumulative_outcome_fraction"] = df["cumulative_outcome"]/numpy.max(df["cumulative_outcome"])
-    seaborn.scatterplot(x = "fraction_of_observations_by_prediction",
-                        y = "cumulative_outcome_fraction",
-                        data = df)
-    seaborn.lineplot(x=[0,1],
-                     y=[0,1],
-                     color="red")
+    df["cumulative_outcome_fraction"] = df["cumulative_outcome"] / numpy.max(
+        df["cumulative_outcome"]
+    )
+    seaborn.scatterplot(
+        x="fraction_of_observations_by_prediction",
+        y="cumulative_outcome_fraction",
+        data=df,
+    )
+    seaborn.lineplot(x=[0, 1], y=[0, 1], color="red")
+
 
 def lift_curve_plot(prediction, outcome):
     """plot lift as a function of prediction order (descending)"""
-    df = pandas.DataFrame({"prediction":prediction, "outcome":outcome})
+    df = pandas.DataFrame({"prediction": prediction, "outcome": outcome})
    df.sort_values(["prediction"], ascending=[False], inplace=True)
-    df["fraction_of_observations_by_prediction"] = [(1+i)/df.shape[0] for i in range(df.shape[0])]
+    df["fraction_of_observations_by_prediction"] = [
+        (1 + i) / df.shape[0] for i in range(df.shape[0])
+    ]
     df["cumulative_outcome"] = df["outcome"].cumsum()
-    df["cumulative_outcome_fraction"] = df["cumulative_outcome"]/numpy.max(df["cumulative_outcome"])
-    df["lift"] = df["cumulative_outcome_fraction"]/df["fraction_of_observations_by_prediction"]
-    seaborn.scatterplot(x = "fraction_of_observations_by_prediction",
-                        y = "lift",
-                        data = df)
+    df["cumulative_outcome_fraction"] = df["cumulative_outcome"] / numpy.max(
+        df["cumulative_outcome"]
+    )
+    df["lift"] = (
+        df["cumulative_outcome_fraction"] / df["fraction_of_observations_by_prediction"]
+    )
+    seaborn.scatterplot(x="fraction_of_observations_by_prediction", y="lift", data=df)
     matplotlib.pyplot.axhline(y=1, color="red")
 
 
 def dual_hist_plot(probs, istrue):
     """plot a dual histogram plot of numeric prediction probs against boolean istrue"""
     matplotlib.pyplot.gcf().clear()
-    pf = pandas.DataFrame({'prob' : [ probs[i] for i in range(probs.shape[0])], 'istrue' : istrue})
+    pf = pandas.DataFrame(
+        {"prob": [probs[i] for i in range(probs.shape[0])], "istrue": istrue}
+    )
     g = seaborn.FacetGrid(pf, row="istrue", height=4, aspect=3)
     bins = numpy.arange(0, 1.1, 0.1)
-    g = g.map(matplotlib.pyplot.hist, "prob", bins=bins)
-    #g = g.map(seaborn.distplot, "prob", bins=bins)
+    g.map(matplotlib.pyplot.hist, "prob", bins=bins)
     matplotlib.pyplot.show()
 
 
 # https://stackoverflow.com/questions/5228158/cartesian-product-of-a-dictionary-of-lists
 def search_grid(inp):
     """build a cross product of all named dictionary entries"""
     gen = (dict(zip(inp.keys(), values)) for values in itertools.product(*inp.values()))
-    return([ci for ci in gen])
+    return [ci for ci in gen]
 
 
 def grid_to_df(grid):
     """convert a search_grid list of maps to a pandas data frame"""
     n = len(grid)
-    keys = [ ki for ki in grid[1].keys() ]
-    return(pandas.DataFrame({ ki : [ grid[i][ki] for i in range(n) ] for ki in keys }))
+    keys = [ki for ki in grid[1].keys()]
+    return pandas.DataFrame({ki: [grid[i][ki] for i in range(n)] for ki in keys})
 
 
 def eval_fn_per_row(f, x2, df):
     """evaluate f(row-as-map, x2) for rows in df"""
-    return([ f({ k : df.loc[i, k] for k in df.columns }, x2) for i in range(df.shape[0]) ])
+    return [f({k: df.loc[i, k] for k in df.columns}, x2) for i in range(df.shape[0])]
 
 
-def perm_score_vars(d, istrue, model, modelvars, k = 5):
+def perm_score_vars(d, istrue, model, modelvars, k=5):
     """evaluate model~istrue on d permuting each of the modelvars and return variable importances"""
     d2 = d.copy()
     preds = model.predict_proba(d2[modelvars])
     basedev = mean_deviance(preds, istrue)
+
     def perm_score_var(victim):
         dorig = numpy.array(d2[victim].copy())
         dnew = numpy.array(d2[victim].copy())
+
         def perm_score_var_once():
             numpy.random.shuffle(dnew)
             d2[victim] = dnew
-            preds = model.predict_proba(d2[modelvars])
-            permdev = mean_deviance(preds, istrue)
-            return(permdev)
-        devs = [ perm_score_var_once() for rep in range(k) ]
-        d2[victim] = dorig
-        return(numpy.mean(devs), statistics.stdev(devs))
-    stats = [ perm_score_var(victim) for victim in modelvars ]
-    vf = pandas.DataFrame({"var" : modelvars})
-    vf["importance"] = [ di[0] - basedev for di in stats ]
-    vf["importance_dev"] = [ di[1] for di in stats ]
-    vf.sort_values(by = ["importance"], ascending = False, inplace = True)
-    return(vf)
+            predsp = model.predict_proba(d2[modelvars])
+            permdev = mean_deviance(predsp, istrue)
+            return permdev
 
+        devs = [perm_score_var_once() for rep in range(k)]
+        d2[victim] = dorig
+        return numpy.mean(devs), statistics.stdev(devs)
+
+    stats = [perm_score_var(victim) for victim in modelvars]
+    vf = pandas.DataFrame({"var": modelvars})
+    vf["importance"] = [di[0] - basedev for di in stats]
+    vf["importance_dev"] = [di[1] for di in stats]
+    vf.sort_values(by=["importance"], ascending=False, inplace=True)
+    return vf
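
As a quick orientation to the reformatted wvpy.util module above, the following usage sketch (not part of this commit) shows how the cross-validation helpers and the permutation-importance routine fit together. The synthetic data, the scikit-learn LogisticRegression fitter, and the column names x1 and x2 are illustrative assumptions; only the wvpy.util function names and argument orders are taken from the diff.

# Hypothetical usage sketch for the helpers shown in the diff above.
# The data, the scikit-learn model, and the column names are assumptions
# made for illustration; the wvpy.util calls follow the signatures in the diff.
import numpy
import pandas
from sklearn.linear_model import LogisticRegression

from wvpy.util import (
    mk_cross_plan,
    cross_predict_model_prob,
    mean_deviance,
    mean_null_deviance,
    perm_score_vars,
)

# build a small synthetic binary classification problem
rng = numpy.random.default_rng(2019)
n = 500
d = pandas.DataFrame({"x1": rng.normal(size=n), "x2": rng.normal(size=n)})
y = pandas.Series(d["x1"] + 0.5 * d["x2"] + rng.normal(size=n) > 0)

# cross-validated class probabilities and deviance versus the null model
plan = mk_cross_plan(n, 5)
probs = cross_predict_model_prob(LogisticRegression(), d, y, plan)
print("cross-validated deviance:", mean_deviance(probs, y))
print("null deviance:", mean_null_deviance(y))

# permutation importance of each input variable for a model fit on all rows
model = LogisticRegression().fit(d, y)
print(perm_score_vars(d, y, model, ["x1", "x2"], k=5))

Per the diff, perm_score_vars reports one row per input column, with importance measured as the increase in mean deviance after shuffling that column, so larger values flag variables the model leans on more.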

pkg/dist/wvpy-0.1-py3-none-any.whl

-17 Bytes
Binary file not shown.

pkg/dist/wvpy-0.1.tar.gz

-2 Bytes
Binary file not shown.
