-
 import numpy
 import statistics
 import matplotlib
 import pandas
 
 
-
 def cross_predict_model(fitter, X, Y, plan):
     """train a model Y~X using the cross validation plan and return predictions"""
-    preds = [None]*X.shape[0]
+    preds = [None] * X.shape[0]
     for g in range(len(plan)):
         pi = plan[g]
         model = fitter.fit(X.iloc[pi["train"]], Y.iloc[pi["train"]])
         predg = model.predict(X.iloc[pi["test"]])
         for i in range(len(pi["test"])):
             preds[pi["test"][i]] = predg[i]
-    return (preds)
+    return preds
 
 
 def cross_predict_model_prob(fitter, X, Y, plan):
@@ -31,38 +29,39 @@ def cross_predict_model_prob(fitter, X, Y, plan):
         model = fitter.fit(X.iloc[pi["train"]], Y.iloc[pi["train"]])
         predg = model.predict_proba(X.iloc[pi["test"]])
         for i in range(len(pi["test"])):
-            preds[pi["test"][i],0] = predg[i,0]
-            preds[pi["test"][i],1] = predg[i,1]
-    return (preds)
-
-
+            preds[pi["test"][i], 0] = predg[i, 0]
+            preds[pi["test"][i], 1] = predg[i, 1]
+    return preds
 
 
 def mean_deviance(predictions, istrue):
     """compute per-row deviance of predictions versus istrue"""
-    mass_on_correct = [ predictions[i,1] if istrue[i] else predictions[i,0] for i in range(len(istrue)) ]
-    return (-2*sum(numpy.log(mass_on_correct))/len(istrue))
-
+    mass_on_correct = [
+        predictions[i, 1] if istrue[i] else predictions[i, 0]
+        for i in range(len(istrue))
+    ]
+    return -2 * sum(numpy.log(mass_on_correct)) / len(istrue)
 
 
 def mean_null_deviance(istrue):
     """compute per-row null deviance of istrue"""
     p = numpy.mean(istrue)
-    mass_on_correct = [ p if istrue[i] else 1-p for i in range(len(istrue)) ]
-    return (-2*sum(numpy.log(mass_on_correct))/len(istrue))
-
+    mass_on_correct = [p if istrue[i] else 1 - p for i in range(len(istrue))]
+    return -2 * sum(numpy.log(mass_on_correct)) / len(istrue)
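
(For reference, a small worked example of the two deviance helpers above; the numbers are illustrative, not from the source, and assume the functions are imported from this module.)

```python
import numpy

# predictions holds class probabilities as columns [P(False), P(True)];
# istrue marks the observed class for each row.
predictions = numpy.array([[0.2, 0.8], [0.7, 0.3], [0.4, 0.6]])
istrue = [True, False, True]

# mean_deviance = -2 * mean(log(probability assigned to the observed class))
#               = -2 * (log(0.8) + log(0.7) + log(0.6)) / 3  ~ 0.73
print(mean_deviance(predictions, istrue))

# mean_null_deviance scores the base rate p = mean(istrue) the same way (~1.27 here).
print(mean_null_deviance(istrue))
```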
 
 
 def mk_cross_plan(n, k):
     """randomly split range(n) into k disjoint groups"""
     grp = [i % k for i in range(n)]
     numpy.random.shuffle(grp)
-    plan = [ { "train" : [i for i in range(n) if grp[i] != j],
-               "test" : [i for i in range(n) if grp[i] == j] } for j in range(k) ]
-    return (plan)
-
-
-
+    plan = [
+        {
+            "train": [i for i in range(n) if grp[i] != j],
+            "test": [i for i in range(n) if grp[i] == j],
+        }
+        for j in range(k)
+    ]
+    return plan
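
(A minimal usage sketch of mk_cross_plan together with the cross-prediction and deviance helpers above; it assumes scikit-learn is available, and the data and variable names are illustrative, not from the source.)

```python
import numpy
import pandas
from sklearn.linear_model import LogisticRegression

# Illustrative data: two numeric features and a boolean outcome.
numpy.random.seed(2019)
n = 200
X = pandas.DataFrame({"x1": numpy.random.normal(size=n),
                      "x2": numpy.random.normal(size=n)})
y = X["x1"] + numpy.random.normal(size=n) > 0

# 5-fold plan: every row index lands in exactly one test group.
plan = mk_cross_plan(n, 5)

# Out-of-sample class probabilities for every row.
preds = cross_predict_model_prob(LogisticRegression(), X, y, plan)

# Cross-validated deviance versus the no-model baseline.
print(mean_deviance(preds, y))
print(mean_null_deviance(y))
```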
 
 
 # https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
@@ -73,132 +72,152 @@ def plot_roc(prediction, istrue):
     matplotlib.pyplot.figure()
     lw = 2
     matplotlib.pyplot.gcf().clear()
-    matplotlib.pyplot.plot(fpr, tpr, color='darkorange',
-                           lw=lw,
-                           label='ROC curve (area = {0:0.2f})'
-                           ''.format(auc))
-    matplotlib.pyplot.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
+    matplotlib.pyplot.plot(
+        fpr,
+        tpr,
+        color="darkorange",
+        lw=lw,
+        label="ROC curve (area = {0:0.2f})" "".format(auc),
+    )
+    matplotlib.pyplot.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
     matplotlib.pyplot.xlim([0.0, 1.0])
     matplotlib.pyplot.ylim([0.0, 1.05])
-    matplotlib.pyplot.xlabel('False Positive Rate')
-    matplotlib.pyplot.ylabel('True Positive Rate')
-    matplotlib.pyplot.title('Receiver operating characteristic example')
+    matplotlib.pyplot.xlabel("False Positive Rate")
+    matplotlib.pyplot.ylabel("True Positive Rate")
+    matplotlib.pyplot.title("Receiver operating characteristic example")
     matplotlib.pyplot.legend(loc="lower right")
     matplotlib.pyplot.show()
-    return (auc)
+    return auc
+
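
(The top of plot_roc, where fpr, tpr, and auc are computed, falls outside this hunk; per the linked scikit-learn example they presumably come from sklearn.metrics.roc_curve and sklearn.metrics.auc. Continuing the sketch above, a call would look like the following.)

```python
# ROC curve of the cross-validated scores for the positive class.
auc = plot_roc(preds[:, 1], y)
```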
 
 def dual_density_plot(probs, istrue):
     """plot a dual density plot of numeric prediction probs against boolean istrue"""
     matplotlib.pyplot.gcf().clear()
-    preds_on_positive = [ probs[i] for i in range(len(probs)) if istrue[i] ]
-    preds_on_negative = [ probs[i] for i in range(len(probs)) if not istrue[i] ]
-    seaborn.kdeplot(preds_on_positive, label = "positive examples", bw = 0.01)
-    seaborn.kdeplot(preds_on_negative, label = "negative examples", bw = 0.01)
+    preds_on_positive = [probs[i] for i in range(len(probs)) if istrue[i]]
+    preds_on_negative = [probs[i] for i in range(len(probs)) if not istrue[i]]
+    seaborn.kdeplot(preds_on_positive, label="positive examples", bw=0.01)
+    seaborn.kdeplot(preds_on_negative, label="negative examples", bw=0.01)
     matplotlib.pyplot.ylabel("density of examples")
     matplotlib.pyplot.xlabel("model score")
     matplotlib.pyplot.show()
 
+
 def dual_density_plot_proba1(probs, istrue):
     """plot a dual density plot of numeric prediction probs[:,1] against boolean istrue"""
     matplotlib.pyplot.gcf().clear()
-    preds_on_positive = [ probs[i,1] for i in range(len(probs)) if istrue[i] ]
-    preds_on_negative = [ probs[i,1] for i in range(len(probs)) if not istrue[i] ]
-    seaborn.kdeplot(preds_on_positive, label = "positive examples", bw = 0.01)
-    seaborn.kdeplot(preds_on_negative, label = "negative examples", bw = 0.01)
+    preds_on_positive = [probs[i, 1] for i in range(len(probs)) if istrue[i]]
+    preds_on_negative = [probs[i, 1] for i in range(len(probs)) if not istrue[i]]
+    seaborn.kdeplot(preds_on_positive, label="positive examples", bw=0.01)
+    seaborn.kdeplot(preds_on_negative, label="negative examples", bw=0.01)
     matplotlib.pyplot.ylabel("density of examples")
     matplotlib.pyplot.xlabel("model score")
     matplotlib.pyplot.show()
 
+
 def dual_hist_plot_proba1(probs, istrue):
     """plot a dual histogram plot of numeric prediction probs[:,1] against boolean istrue"""
     matplotlib.pyplot.gcf().clear()
-    pf = pandas.DataFrame({'prob' : [ probs[i,1] for i in range(probs.shape[0])], 'istrue' : istrue})
+    pf = pandas.DataFrame(
+        {"prob": [probs[i, 1] for i in range(probs.shape[0])], "istrue": istrue}
+    )
     g = seaborn.FacetGrid(pf, row="istrue", height=4, aspect=3)
     bins = numpy.arange(0, 1.1, 0.1)
-    g = g.map(matplotlib.pyplot.hist, "prob", bins=bins)
-    #g = g.map(seaborn.distplot, "prob", bins=bins)
+    g.map(matplotlib.pyplot.hist, "prob", bins=bins)
     matplotlib.pyplot.show()
-
+
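
(Usage sketch for the score-distribution plots above, continuing the earlier example. Note these calls pass bw= to seaborn.kdeplot, which newer seaborn releases (0.11+) replace with bw_method/bw_adjust, so an older seaborn is assumed.)

```python
# Compare the distribution of P(True) between actual positives and negatives.
dual_density_plot_proba1(preds, y)   # overlaid kernel density estimates
dual_hist_plot_proba1(preds, y)      # faceted histograms, one row per class
```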
 
 def gain_curve_plot(prediction, outcome):
     """plot cumulative outcome as a function of prediction order (descending)"""
-    df = pandas.DataFrame({"prediction":prediction, "outcome":outcome})
+    df = pandas.DataFrame({"prediction": prediction, "outcome": outcome})
     df.sort_values(["prediction"], ascending=[False], inplace=True)
-    df["fraction_of_observations_by_prediction"] = [(1+i)/df.shape[0] for i in range(df.shape[0])]
+    df["fraction_of_observations_by_prediction"] = [
+        (1 + i) / df.shape[0] for i in range(df.shape[0])
+    ]
     df["cumulative_outcome"] = df["outcome"].cumsum()
-    df["cumulative_outcome_fraction"] = df["cumulative_outcome"]/numpy.max(df["cumulative_outcome"])
-    seaborn.scatterplot(x="fraction_of_observations_by_prediction",
-                        y="cumulative_outcome_fraction",
-                        data=df)
-    seaborn.lineplot(x=[0,1],
-                     y=[0,1],
-                     color="red")
+    df["cumulative_outcome_fraction"] = df["cumulative_outcome"] / numpy.max(
+        df["cumulative_outcome"]
+    )
+    seaborn.scatterplot(
+        x="fraction_of_observations_by_prediction",
+        y="cumulative_outcome_fraction",
+        data=df,
+    )
+    seaborn.lineplot(x=[0, 1], y=[0, 1], color="red")
+
 
 def lift_curve_plot(prediction, outcome):
     """plot lift as a function of prediction order (descending)"""
-    df = pandas.DataFrame({"prediction":prediction, "outcome":outcome})
+    df = pandas.DataFrame({"prediction": prediction, "outcome": outcome})
     df.sort_values(["prediction"], ascending=[False], inplace=True)
-    df["fraction_of_observations_by_prediction"] = [(1+i)/df.shape[0] for i in range(df.shape[0])]
+    df["fraction_of_observations_by_prediction"] = [
+        (1 + i) / df.shape[0] for i in range(df.shape[0])
+    ]
     df["cumulative_outcome"] = df["outcome"].cumsum()
-    df["cumulative_outcome_fraction"] = df["cumulative_outcome"]/numpy.max(df["cumulative_outcome"])
-    df["lift"] = df["cumulative_outcome_fraction"]/df["fraction_of_observations_by_prediction"]
-    seaborn.scatterplot(x="fraction_of_observations_by_prediction",
-                        y="lift",
-                        data=df)
+    df["cumulative_outcome_fraction"] = df["cumulative_outcome"] / numpy.max(
+        df["cumulative_outcome"]
+    )
+    df["lift"] = (
+        df["cumulative_outcome_fraction"] / df["fraction_of_observations_by_prediction"]
+    )
+    seaborn.scatterplot(x="fraction_of_observations_by_prediction", y="lift", data=df)
     matplotlib.pyplot.axhline(y=1, color="red")
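
(Both curves sort rows by descending prediction; lift is the cumulative fraction of outcome captured divided by the fraction of rows examined, so a lift of 2 at the 0.1 mark means the top 10% of rows hold twice the base rate of outcome. A usage sketch, continuing the example above.)

```python
import matplotlib.pyplot

# Cumulative outcome captured as we work down the sorted predictions.
gain_curve_plot(preds[:, 1], y.astype(float))
matplotlib.pyplot.show()

# Same ordering, expressed as lift over the random-selection baseline.
lift_curve_plot(preds[:, 1], y.astype(float))
matplotlib.pyplot.show()
```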
 
 
 def dual_hist_plot(probs, istrue):
     """plot a dual histogram plot of numeric prediction probs against boolean istrue"""
     matplotlib.pyplot.gcf().clear()
-    pf = pandas.DataFrame({'prob' : [ probs[i] for i in range(probs.shape[0])], 'istrue' : istrue})
+    pf = pandas.DataFrame(
+        {"prob": [probs[i] for i in range(probs.shape[0])], "istrue": istrue}
+    )
     g = seaborn.FacetGrid(pf, row="istrue", height=4, aspect=3)
     bins = numpy.arange(0, 1.1, 0.1)
-    g = g.map(matplotlib.pyplot.hist, "prob", bins=bins)
-    #g = g.map(seaborn.distplot, "prob", bins=bins)
+    g.map(matplotlib.pyplot.hist, "prob", bins=bins)
     matplotlib.pyplot.show()
 
 
 # https://stackoverflow.com/questions/5228158/cartesian-product-of-a-dictionary-of-lists
 def search_grid(inp):
     """build a cross product of all named dictionary entries"""
     gen = (dict(zip(inp.keys(), values)) for values in itertools.product(*inp.values()))
-    return ([ci for ci in gen])
+    return [ci for ci in gen]
 
 
 def grid_to_df(grid):
     """convert a search_grid list of maps to a pandas data frame"""
     n = len(grid)
-    keys = [ ki for ki in grid[1].keys() ]
-    return ( pandas.DataFrame({ ki : [ grid[i][ki] for i in range(n) ] for ki in keys }) )
+    keys = [ki for ki in grid[1].keys()]
+    return pandas.DataFrame({ki: [grid[i][ki] for i in range(n)] for ki in keys})
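
(A small usage sketch of search_grid and grid_to_df; the parameter names are illustrative.)

```python
# Cross product of named parameter lists -> list of parameter dictionaries.
grid = search_grid({"max_depth": [3, 5], "min_samples_leaf": [1, 10]})
# [{'max_depth': 3, 'min_samples_leaf': 1},
#  {'max_depth': 3, 'min_samples_leaf': 10},
#  {'max_depth': 5, 'min_samples_leaf': 1},
#  {'max_depth': 5, 'min_samples_leaf': 10}]

# Same grid as a pandas data frame, one row per parameter combination.
grid_frame = grid_to_df(grid)
```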
 
 
 def eval_fn_per_row(f, x2, df):
     """evaluate f(row-as-map, x2) for rows in df"""
-    return ([ f({ k : df.loc[i, k] for k in df.columns }, x2) for i in range(df.shape[0]) ])
+    return [f({k: df.loc[i, k] for k in df.columns}, x2) for i in range(df.shape[0])]
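
(Usage sketch: eval_fn_per_row applies f(row-as-dict, x2) to each row of a data frame.)

```python
import pandas

df = pandas.DataFrame({"a": [1, 2], "b": [10, 20]})
sums = eval_fn_per_row(lambda row, x2: row["a"] + row["b"] + x2, 100, df)
# [111, 122]
```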
 
 
-def perm_score_vars(d, istrue, model, modelvars, k = 5):
+def perm_score_vars(d, istrue, model, modelvars, k=5):
     """evaluate model~istrue on d permuting each of the modelvars and return variable importances"""
     d2 = d.copy()
     preds = model.predict_proba(d2[modelvars])
     basedev = mean_deviance(preds, istrue)
+
     def perm_score_var(victim):
         dorig = numpy.array(d2[victim].copy())
         dnew = numpy.array(d2[victim].copy())
+
         def perm_score_var_once():
             numpy.random.shuffle(dnew)
             d2[victim] = dnew
-            preds = model.predict_proba(d2[modelvars])
-            permdev = mean_deviance(preds, istrue)
-            return (permdev)
-        devs = [ perm_score_var_once() for rep in range(k) ]
-        d2[victim] = dorig
-        return (numpy.mean(devs), statistics.stdev(devs))
-    stats = [ perm_score_var(victim) for victim in modelvars ]
-    vf = pandas.DataFrame({"var" : modelvars})
-    vf["importance"] = [ di[0] - basedev for di in stats ]
-    vf["importance_dev"] = [ di[1] for di in stats ]
-    vf.sort_values(by = ["importance"], ascending = False, inplace = True)
-    return (vf)
+            predsp = model.predict_proba(d2[modelvars])
+            permdev = mean_deviance(predsp, istrue)
+            return permdev
 
+        devs = [perm_score_var_once() for rep in range(k)]
+        d2[victim] = dorig
+        return numpy.mean(devs), statistics.stdev(devs)
+
+    stats = [perm_score_var(victim) for victim in modelvars]
+    vf = pandas.DataFrame({"var": modelvars})
+    vf["importance"] = [di[0] - basedev for di in stats]
+    vf["importance_dev"] = [di[1] for di in stats]
+    vf.sort_values(by=["importance"], ascending=False, inplace=True)
+    return vf
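
(Permutation importance: each variable is shuffled k times and the increase in deviance over the unpermuted baseline is reported, along with its standard deviation. Continuing the earlier sketch; names are illustrative.)

```python
from sklearn.linear_model import LogisticRegression

# Fit on all rows, then score each variable by how much shuffling it hurts.
model = LogisticRegression().fit(X, y)
importances = perm_score_vars(X, y, model, ["x1", "x2"], k=5)
print(importances)   # columns: var, importance, importance_dev
```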