Skip to content

Commit d2e520e

Browse files
committed
get significance calc working
1 parent fb1537b commit d2e520e

File tree

12 files changed

+239
-55
lines changed

12 files changed

+239
-55
lines changed

coverage.txt

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2,32 +2,34 @@
22
platform darwin -- Python 3.7.5, pytest-5.2.4, py-1.8.0, pluggy-0.13.0
33
rootdir: /Users/johnmount/Documents/work/pyvtreat/pkg
44
plugins: cov-2.8.1
5-
collected 15 items
5+
collected 17 items
66

7-
pkg/tests/test_classification.py .. [ 13%]
8-
pkg/tests/test_col_name_issues.py ... [ 33%]
9-
pkg/tests/test_imputation_controls.py . [ 40%]
10-
pkg/tests/test_multinomial.py . [ 46%]
11-
pkg/tests/test_nan_inf.py . [ 53%]
12-
pkg/tests/test_outcome_name_required.py . [ 60%]
13-
pkg/tests/test_r1_issue.py . [ 66%]
14-
pkg/tests/test_range.py . [ 73%]
15-
pkg/tests/test_regression.py . [ 80%]
16-
pkg/tests/test_unsupervised.py . [ 86%]
17-
pkg/tests/test_user_coders.py . [ 93%]
7+
pkg/tests/test_classification.py .. [ 11%]
8+
pkg/tests/test_col_name_issues.py ... [ 29%]
9+
pkg/tests/test_imputation_controls.py . [ 35%]
10+
pkg/tests/test_multinomial.py . [ 41%]
11+
pkg/tests/test_nan_inf.py . [ 47%]
12+
pkg/tests/test_outcome_name_required.py . [ 52%]
13+
pkg/tests/test_r1_issue.py . [ 58%]
14+
pkg/tests/test_range.py . [ 64%]
15+
pkg/tests/test_regression.py . [ 70%]
16+
pkg/tests/test_stats.py .. [ 82%]
17+
pkg/tests/test_unsupervised.py . [ 88%]
18+
pkg/tests/test_user_coders.py . [ 94%]
1819
pkg/tests/test_util.py . [100%]
1920

2021
---------- coverage: platform darwin, python 3.7.5-final-0 -----------
2122
Name Stmts Miss Cover
2223
-----------------------------------------------
2324
pkg/vtreat/__init__.py 6 0 100%
2425
pkg/vtreat/cross_plan.py 50 11 78%
26+
pkg/vtreat/stats_utils.py 53 11 79%
2527
pkg/vtreat/transform.py 17 4 76%
26-
pkg/vtreat/util.py 146 23 84%
28+
pkg/vtreat/util.py 133 19 86%
2729
pkg/vtreat/vtreat_api.py 227 47 79%
2830
pkg/vtreat/vtreat_impl.py 581 83 86%
2931
-----------------------------------------------
30-
TOTAL 1027 168 84%
32+
TOTAL 1067 175 84%
3133

3234

33-
============================== 15 passed in 8.46s ==============================
35+
============================= 17 passed in 10.91s ==============================
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
import numpy
2+
3+
import scipy.stats
4+
5+
have_sklearn = False
6+
# noinspection PyBroadException
7+
try:
8+
import sklearn.linear_model
9+
10+
have_sklearn = True
11+
except Exception:
12+
pass
13+
14+
15+
# methods to avoid calling statsmodels which seems to be incompatible with many
16+
# versions of other packages we need:
17+
# https://github.com/WinVector/pyvtreat/issues/14
18+
19+
20+
def our_corr_score(*, y_true, y_pred):
    """
    Compute the Pearson correlation between y_true and y_pred, and its significance.

    Degenerate inputs are mapped to fixed answers instead of being passed to
    scipy (which would warn or return NaN on constant data):
      - fewer than 2 observations, or constant y_true: returns (1, 1)
      - constant y_pred: returns (0, 1)
      - exactly 2 observations: the correlation is reported, but the
        significance is forced to 1 (two points carry no evidence).

    :param y_true: array-like of numeric outcomes
    :param y_pred: array-like of numeric predictions, same length as y_true
    :return: pair (r, sig): Pearson correlation and its p-value
    :raises ValueError: if y_true and y_pred differ in length
    """
    y_true = numpy.asarray(y_true)
    y_pred = numpy.asarray(y_pred)
    n = len(y_true)
    # Check lengths up front: the degenerate-case early returns below would
    # otherwise silently accept mismatched inputs.
    if len(y_pred) != n:
        raise ValueError("y_true and y_pred must have the same length")
    if n < 2:
        return 1, 1
    if numpy.min(y_true) >= numpy.max(y_true):
        # constant outcome: nothing to explain
        return 1, 1
    if numpy.min(y_pred) >= numpy.max(y_pred):
        # constant prediction: explains nothing
        return 0, 1
    r, sig = scipy.stats.pearsonr(y_true, y_pred)
    if n < 3:
        sig = 1
    return r, sig
35+
36+
37+
# noinspection PyPep8Naming
38+
def our_pseudo_R2(*, y_true, y_pred):
    """
    Compute a deviance-based pseudo R-squared of y_pred as a predictor of
    y_true, and the significance of that fit.

    A single-variable logistic regression of y_true on y_pred is fit, then
    pseudo-R2 = 1 - deviance / null_deviance. Significance is the chi-square
    tail probability of the deviance improvement on 1 degree of freedom
    (following https://github.com/WinVector/sigr/blob/master/R/ChiSqTest.R).

    If sklearn is not available this falls back to the squared Pearson
    correlation and its significance (see our_corr_score).

    Degenerate inputs get fixed answers: fewer than 2 observations or a
    constant y_true returns (1, 1); a constant y_pred returns (0, 1).

    NOTE(review): the deviance formulas treat y_true as 0/1 coded -- confirm
    callers always pass an indicator outcome.

    :param y_true: array-like of outcomes (presumably 0/1 indicators)
    :param y_pred: array-like of numeric predictions, same length as y_true
    :return: pair (r2, sig): pseudo R-squared and its significance
    """
    if not have_sklearn:
        cor, sig = our_corr_score(y_true=y_true, y_pred=y_pred)
        return cor**2, sig
    y_true = numpy.asarray(y_true)
    y_pred = numpy.asarray(y_pred)
    n = len(y_true)
    if n < 2:
        return 1, 1
    if numpy.min(y_true) >= numpy.max(y_true):
        return 1, 1
    if numpy.min(y_pred) >= numpy.max(y_pred):
        return 0, 1

    # one-column design matrix, shared by fit and predict
    x = y_pred.reshape((n, 1))
    fitter = sklearn.linear_model.LogisticRegression(
        penalty='l2',
        solver='lbfgs',
        fit_intercept=True,
        C=1000)
    fitter.fit(X=x, y=y_true)
    preds = fitter.predict_proba(X=x)[:, 1]
    # keep probabilities away from 0 and 1 so the logs below stay finite
    eps = 1e-5
    preds = numpy.minimum(preds, 1 - eps)
    preds = numpy.maximum(preds, eps)
    deviance = -2 * numpy.sum(
        y_true * numpy.log(preds) + (1 - y_true) * numpy.log(1 - preds))
    # null model: predict the outcome mean for every row
    null_pred = numpy.zeros(n) + numpy.mean(y_true)
    null_deviance = -2 * numpy.sum(
        y_true * numpy.log(null_pred) + (1 - y_true) * numpy.log(1 - null_pred))
    r2 = 1 - deviance / null_deviance
    sig = 1

    if n >= 3:
        # https://github.com/WinVector/sigr/blob/master/R/ChiSqTest.R
        df_null = n - 1
        df_residual = n - 2
        delta_deviance = null_deviance - deviance
        delta_df = df_null - df_residual
        # survival function rather than 1 - cdf: 1 - cdf rounds to exactly 0
        # for strong signals, losing all resolution among small p-values
        sig = scipy.stats.chi2.sf(delta_deviance, df=delta_df)

    return r2, sig

pkg/build/lib/vtreat/util.py

Lines changed: 5 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010

1111
import numpy
1212
import pandas
13-
import scipy.stats
13+
14+
15+
import vtreat.stats_utils
1416

1517

1618
def safe_to_numeric_array(x):
@@ -143,23 +145,6 @@ def grouped_by_x_statistics(x, y):
143145
return sf
144146

145147

146-
def our_corr_score(*, y_true, y_pred):
147-
# compute Pearson correlation
148-
y_true = numpy.asarray(y_true)
149-
y_pred = numpy.asarray(y_pred)
150-
n = len(y_true)
151-
if n < 2:
152-
return 1, 1
153-
if numpy.min(y_true) >= numpy.max(y_true):
154-
return 1, 1
155-
if numpy.min(y_pred) >= numpy.max(y_pred):
156-
return 0, 1
157-
r, sig = scipy.stats.pearsonr(y_true, y_pred)
158-
if n < 3:
159-
sig = 1
160-
return r, sig
161-
162-
163148
def score_variables(cross_frame, variables, outcome,
164149
*,
165150
is_classification=False):
@@ -178,10 +163,10 @@ def f(v):
178163
if (n > 2) and \
179164
(numpy.max(col) > numpy.min(col)) and \
180165
(numpy.max(outcome) > numpy.min(outcome)):
181-
cor, sig = our_corr_score(y_true=outcome, y_pred=col)
166+
cor, sig = vtreat.stats_utils.our_corr_score(y_true=outcome, y_pred=col)
182167
r2 = cor**2
183168
if is_classification:
184-
pass # TODO: fix this up
169+
r2, sig = vtreat.stats_utils.our_pseudo_R2(y_true=outcome, y_pred=col)
185170
sfi = pandas.DataFrame(
186171
{
187172
"variable": [v],
891 Bytes
Binary file not shown.

pkg/dist/vtreat-0.4.2.tar.gz

594 Bytes
Binary file not shown.

pkg/setup.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@
6262
'pandas',
6363
'scipy'
6464
],
65+
extras_require={
66+
'pseudoR2': ['sklearn'],
67+
'all': ['sklearn'],
68+
},
6569
platforms=['any'],
6670
license='License :: OSI Approved :: BSD 3-clause License',
6771
python_requires=">=3.5.3",

pkg/tests/test_stats.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
2+
import numpy
3+
4+
import vtreat.stats_utils
5+
6+
def test_linear_cor():
    """Check our_corr_score against reference values from R's lm() summary."""
    observed = [1, 1, 0, 1, 0, 1, 1, 0, 1, 0]
    predicted = [0.8, 1, 0.2, 0.5, 0.5, 0.8, 1, 0.2, 0.5, 0.5]
    # Reference values, computed in R:
    #   y_true = c(1, 1, 0, 1, 0, 1, 1, 0, 1, 0)
    #   y_pred = c(0.8, 1, 0.2, 0.5, 0.5, 0.8, 1, 0.2, 0.5, 0.5)
    #   summary(lm(y_true ~ y_pred))
    #   Multiple R-squared:  0.5482, Adjusted R-squared:  0.4918
    #   F-statistic: 9.709 on 1 and 8 DF,  p-value: 0.01432
    expected_r2 = 0.5482
    expected_sig = 0.01432
    tolerance = 1.0e-2
    cor, sig = vtreat.stats_utils.our_corr_score(
        y_true=observed, y_pred=predicted)
    assert numpy.abs(cor*cor - expected_r2) < tolerance
    assert numpy.abs(sig - expected_sig) < tolerance
18+
19+
20+
21+
def test_logistic_r2():
    """Check our_pseudo_R2 against reference values from R's glm() and sigr."""
    if not vtreat.stats_utils.have_sklearn:
        # without sklearn the logistic path is not available; nothing to check
        return
    observed = [1, 1, 0, 0, 0, 1, 1, 0, 1, 1]
    predicted = [0.8, 1, 1, 0.5, 0.5, 0.8, 1, 0.2, 0.5, 0.5]
    # Reference values, computed in R:
    #   y_true = c(1, 1, 0, 0, 0, 1, 1, 0, 1, 1)
    #   y_pred = c(0.8, 1, 1, 0.5, 0.5, 0.8, 1, 0.2, 0.5, 0.5)
    #   (s <- summary(glm(y_true ~ y_pred, family = binomial())))
    #   Null deviance: 13.460  on 9  degrees of freedom
    #   Residual deviance: 11.762  on 8  degrees of freedom
    #   (w <- sigr::wrapChiSqTest(s))
    #   Chi-Square Test summary: pseudo-R2=0.1262 (X2(1,N=10)=1.698, p=n.s.).
    #   w$pValue
    #   [1] 0.1925211
    tolerance = 1.0e-2
    check_r2 = 1 - 11.762/13.460
    r2, sig = vtreat.stats_utils.our_pseudo_R2(
        y_true=observed, y_pred=predicted)
    assert numpy.abs(r2 - check_r2) < tolerance
    assert numpy.abs(r2 - 0.1262) < tolerance
    assert numpy.abs(sig - 0.1925211) < tolerance

pkg/vtreat.egg-info/PKG-INFO

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,3 +62,5 @@ Classifier: Programming Language :: Python :: 3.8
6262
Classifier: License :: OSI Approved :: BSD License
6363
Requires-Python: >=3.5.3
6464
Description-Content-Type: text/markdown
65+
Provides-Extra: pseudoR2
66+
Provides-Extra: all

pkg/vtreat.egg-info/SOURCES.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ README.md
22
setup.py
33
vtreat/__init__.py
44
vtreat/cross_plan.py
5+
vtreat/stats_utils.py
56
vtreat/transform.py
67
vtreat/util.py
78
vtreat/vtreat_api.py

pkg/vtreat.egg-info/requires.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
11
numpy
22
pandas
33
scipy
4+
5+
[all]
6+
sklearn
7+
8+
[pseudoR2]
9+
sklearn

0 commit comments

Comments
 (0)