Skip to content

Commit

Permalink
get significance calc working
Browse files Browse the repository at this point in the history
  • Loading branch information
JohnMount committed Mar 10, 2020
1 parent fb1537b commit d2e520e
Show file tree
Hide file tree
Showing 12 changed files with 239 additions and 55 deletions.
32 changes: 17 additions & 15 deletions coverage.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,32 +2,34 @@
platform darwin -- Python 3.7.5, pytest-5.2.4, py-1.8.0, pluggy-0.13.0
rootdir: /Users/johnmount/Documents/work/pyvtreat/pkg
plugins: cov-2.8.1
collected 15 items
collected 17 items

pkg/tests/test_classification.py .. [ 13%]
pkg/tests/test_col_name_issues.py ... [ 33%]
pkg/tests/test_imputation_controls.py . [ 40%]
pkg/tests/test_multinomial.py . [ 46%]
pkg/tests/test_nan_inf.py . [ 53%]
pkg/tests/test_outcome_name_required.py . [ 60%]
pkg/tests/test_r1_issue.py . [ 66%]
pkg/tests/test_range.py . [ 73%]
pkg/tests/test_regression.py . [ 80%]
pkg/tests/test_unsupervised.py . [ 86%]
pkg/tests/test_user_coders.py . [ 93%]
pkg/tests/test_classification.py .. [ 11%]
pkg/tests/test_col_name_issues.py ... [ 29%]
pkg/tests/test_imputation_controls.py . [ 35%]
pkg/tests/test_multinomial.py . [ 41%]
pkg/tests/test_nan_inf.py . [ 47%]
pkg/tests/test_outcome_name_required.py . [ 52%]
pkg/tests/test_r1_issue.py . [ 58%]
pkg/tests/test_range.py . [ 64%]
pkg/tests/test_regression.py . [ 70%]
pkg/tests/test_stats.py .. [ 82%]
pkg/tests/test_unsupervised.py . [ 88%]
pkg/tests/test_user_coders.py . [ 94%]
pkg/tests/test_util.py . [100%]

---------- coverage: platform darwin, python 3.7.5-final-0 -----------
Name Stmts Miss Cover
-----------------------------------------------
pkg/vtreat/__init__.py 6 0 100%
pkg/vtreat/cross_plan.py 50 11 78%
pkg/vtreat/stats_utils.py 53 11 79%
pkg/vtreat/transform.py 17 4 76%
pkg/vtreat/util.py 146 23 84%
pkg/vtreat/util.py 133 19 86%
pkg/vtreat/vtreat_api.py 227 47 79%
pkg/vtreat/vtreat_impl.py 581 83 86%
-----------------------------------------------
TOTAL 1027 168 84%
TOTAL 1067 175 84%


============================== 15 passed in 8.46s ==============================
============================= 17 passed in 10.91s ==============================
79 changes: 79 additions & 0 deletions pkg/build/lib/vtreat/stats_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import numpy

import scipy.stats

have_sklearn = False
# noinspection PyBroadException
try:
import sklearn.linear_model

have_sklearn = True
except Exception:
pass


# methods to avoid calling statsmodels which seems to be incompatible with many
# versions of other packages we need:
# https://github.com/WinVector/pyvtreat/issues/14


def our_corr_score(*, y_true, y_pred):
# compute Pearson correlation
y_true = numpy.asarray(y_true)
y_pred = numpy.asarray(y_pred)
n = len(y_true)
if n < 2:
return 1, 1
if numpy.min(y_true) >= numpy.max(y_true):
return 1, 1
if numpy.min(y_pred) >= numpy.max(y_pred):
return 0, 1
r, sig = scipy.stats.pearsonr(y_true, y_pred)
if n < 3:
sig = 1
return r, sig


# noinspection PyPep8Naming
def our_pseudo_R2(*, y_true, y_pred):
if not have_sklearn:
cor, sig = our_corr_score(y_true=y_true, y_pred=y_pred)
return cor**2, sig
# compute Pearson correlation
y_true = numpy.asarray(y_true)
y_pred = numpy.asarray(y_pred)
n = len(y_true)
if n < 2:
return 1, 1
if numpy.min(y_true) >= numpy.max(y_true):
return 1, 1
if numpy.min(y_pred) >= numpy.max(y_pred):
return 0, 1

fitter = sklearn.linear_model.LogisticRegression(
penalty='l2',
solver='lbfgs',
fit_intercept=True,
C=1000)
fitter.fit(X=y_pred.reshape((n, 1)), y=y_true)
preds = fitter.predict_proba(X=y_pred.reshape((n, 1)))[:, 1]
eps = 1e-5
preds = numpy.minimum(preds, 1-eps)
preds = numpy.maximum(preds, eps)
deviance = -2 * numpy.sum(y_true * numpy.log(preds) +\
(1 - y_true) * numpy.log(1 - preds))
null_pred = numpy.zeros(n) + numpy.mean(y_true)
null_deviance = -2 * numpy.sum(y_true * numpy.log(null_pred) +\
(1 - y_true) * numpy.log(1 - null_pred))
r2 = 1 - deviance/null_deviance
sig = 1

if n >= 3:
# https://github.com/WinVector/sigr/blob/master/R/ChiSqTest.R
df_null = n - 1
df_residual = n - 2
delta_deviance = null_deviance - deviance
delta_df = df_null - df_residual
sig = 1 - scipy.stats.chi2.cdf(x=delta_deviance, df=delta_df)

return r2, sig
25 changes: 5 additions & 20 deletions pkg/build/lib/vtreat/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@

import numpy
import pandas
import scipy.stats


import vtreat.stats_utils


def safe_to_numeric_array(x):
Expand Down Expand Up @@ -143,23 +145,6 @@ def grouped_by_x_statistics(x, y):
return sf


def our_corr_score(*, y_true, y_pred):
# compute Pearson correlation
y_true = numpy.asarray(y_true)
y_pred = numpy.asarray(y_pred)
n = len(y_true)
if n < 2:
return 1, 1
if numpy.min(y_true) >= numpy.max(y_true):
return 1, 1
if numpy.min(y_pred) >= numpy.max(y_pred):
return 0, 1
r, sig = scipy.stats.pearsonr(y_true, y_pred)
if n < 3:
sig = 1
return r, sig


def score_variables(cross_frame, variables, outcome,
*,
is_classification=False):
Expand All @@ -178,10 +163,10 @@ def f(v):
if (n > 2) and \
(numpy.max(col) > numpy.min(col)) and \
(numpy.max(outcome) > numpy.min(outcome)):
cor, sig = our_corr_score(y_true=outcome, y_pred=col)
cor, sig = vtreat.stats_utils.our_corr_score(y_true=outcome, y_pred=col)
r2 = cor**2
if is_classification:
pass # TODO: fix this up
r2, sig = vtreat.stats_utils.our_pseudo_R2(y_true=outcome, y_pred=col)
sfi = pandas.DataFrame(
{
"variable": [v],
Expand Down
Binary file modified pkg/dist/vtreat-0.4.2-py3-none-any.whl
Binary file not shown.
Binary file modified pkg/dist/vtreat-0.4.2.tar.gz
Binary file not shown.
4 changes: 4 additions & 0 deletions pkg/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@
'pandas',
'scipy'
],
extras_require={
'pseudoR2': ['sklearn'],
'all': ['sklearn'],
},
platforms=['any'],
license='License :: OSI Approved :: BSD 3-clause License',
python_requires=">=3.5.3",
Expand Down
41 changes: 41 additions & 0 deletions pkg/tests/test_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@

import numpy

import vtreat.stats_utils

def test_linear_cor():
y_true = [1, 1, 0, 1, 0, 1, 1, 0, 1, 0]
y_pred = [0.8, 1, 0.2, 0.5, 0.5, 0.8, 1, 0.2, 0.5, 0.5]
cor, sig = vtreat.stats_utils.our_corr_score(y_true=y_true, y_pred=y_pred)
# R:
# y_true = c(1, 1, 0, 1, 0, 1, 1, 0, 1, 0)
# y_pred = c(0.8, 1, 0.2, 0.5, 0.5, 0.8, 1, 0.2, 0.5, 0.5)
# summary(lm(y_true ~ y_pred))
# Multiple R-squared: 0.5482, Adjusted R-squared: 0.4918
# F-statistic: 9.709 on 1 and 8 DF, p-value: 0.01432
assert numpy.abs(cor*cor - 0.5482) < 1.0e-2
assert numpy.abs(sig - 0.01432) < 1.0e-2



def test_logistic_r2():
if not vtreat.stats_utils.have_sklearn:
return
y_true = [1, 1, 0, 0, 0, 1, 1, 0, 1, 1]
y_pred = [0.8, 1, 1, 0.5, 0.5, 0.8, 1, 0.2, 0.5, 0.5]
# R:
# y_true = c(1, 1, 0, 0, 0, 1, 1, 0, 1, 1)
# y_pred = c(0.8, 1, 1, 0.5, 0.5, 0.8, 1, 0.2, 0.5, 0.5)
# (s <- summary(glm(y_true ~ y_pred, family = binomial())))
# Null deviance: 13.460 on 9 degrees of freedom
# Residual deviance: 11.762 on 8 degrees of freedom
# (w <- sigr::wrapChiSqTest(s))
# Chi-Square Test summary: pseudo-R2=0.1262 (X2(1,N=10)=1.698, p=n.s.).
# w$pValue
# [1] 0.1925211
check_r2 = 1 - 11.762/13.460
r2, sig = vtreat.stats_utils.our_pseudo_R2(y_true=y_true, y_pred=y_pred)
assert numpy.abs(r2 - check_r2) < 1.0e-2
assert numpy.abs(r2 - 0.1262) < 1.0e-2
assert numpy.abs(sig - 0.1925211) < 1.0e-2
pass
2 changes: 2 additions & 0 deletions pkg/vtreat.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,5 @@ Classifier: Programming Language :: Python :: 3.8
Classifier: License :: OSI Approved :: BSD License
Requires-Python: >=3.5.3
Description-Content-Type: text/markdown
Provides-Extra: pseudoR2
Provides-Extra: all
1 change: 1 addition & 0 deletions pkg/vtreat.egg-info/SOURCES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ README.md
setup.py
vtreat/__init__.py
vtreat/cross_plan.py
vtreat/stats_utils.py
vtreat/transform.py
vtreat/util.py
vtreat/vtreat_api.py
Expand Down
6 changes: 6 additions & 0 deletions pkg/vtreat.egg-info/requires.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
numpy
pandas
scipy

[all]
sklearn

[pseudoR2]
sklearn
79 changes: 79 additions & 0 deletions pkg/vtreat/stats_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import numpy

import scipy.stats

have_sklearn = False
# noinspection PyBroadException
try:
import sklearn.linear_model

have_sklearn = True
except Exception:
pass


# methods to avoid calling statsmodels which seems to be incompatible with many
# versions of other packages we need:
# https://github.com/WinVector/pyvtreat/issues/14


def our_corr_score(*, y_true, y_pred):
# compute Pearson correlation
y_true = numpy.asarray(y_true)
y_pred = numpy.asarray(y_pred)
n = len(y_true)
if n < 2:
return 1, 1
if numpy.min(y_true) >= numpy.max(y_true):
return 1, 1
if numpy.min(y_pred) >= numpy.max(y_pred):
return 0, 1
r, sig = scipy.stats.pearsonr(y_true, y_pred)
if n < 3:
sig = 1
return r, sig


# noinspection PyPep8Naming
def our_pseudo_R2(*, y_true, y_pred):
if not have_sklearn:
cor, sig = our_corr_score(y_true=y_true, y_pred=y_pred)
return cor**2, sig
# compute Pearson correlation
y_true = numpy.asarray(y_true)
y_pred = numpy.asarray(y_pred)
n = len(y_true)
if n < 2:
return 1, 1
if numpy.min(y_true) >= numpy.max(y_true):
return 1, 1
if numpy.min(y_pred) >= numpy.max(y_pred):
return 0, 1

fitter = sklearn.linear_model.LogisticRegression(
penalty='l2',
solver='lbfgs',
fit_intercept=True,
C=1000)
fitter.fit(X=y_pred.reshape((n, 1)), y=y_true)
preds = fitter.predict_proba(X=y_pred.reshape((n, 1)))[:, 1]
eps = 1e-5
preds = numpy.minimum(preds, 1-eps)
preds = numpy.maximum(preds, eps)
deviance = -2 * numpy.sum(y_true * numpy.log(preds) +\
(1 - y_true) * numpy.log(1 - preds))
null_pred = numpy.zeros(n) + numpy.mean(y_true)
null_deviance = -2 * numpy.sum(y_true * numpy.log(null_pred) +\
(1 - y_true) * numpy.log(1 - null_pred))
r2 = 1 - deviance/null_deviance
sig = 1

if n >= 3:
# https://github.com/WinVector/sigr/blob/master/R/ChiSqTest.R
df_null = n - 1
df_residual = n - 2
delta_deviance = null_deviance - deviance
delta_df = df_null - df_residual
sig = 1 - scipy.stats.chi2.cdf(x=delta_deviance, df=delta_df)

return r2, sig
25 changes: 5 additions & 20 deletions pkg/vtreat/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@

import numpy
import pandas
import scipy.stats


import vtreat.stats_utils


def safe_to_numeric_array(x):
Expand Down Expand Up @@ -143,23 +145,6 @@ def grouped_by_x_statistics(x, y):
return sf


def our_corr_score(*, y_true, y_pred):
# compute Pearson correlation
y_true = numpy.asarray(y_true)
y_pred = numpy.asarray(y_pred)
n = len(y_true)
if n < 2:
return 1, 1
if numpy.min(y_true) >= numpy.max(y_true):
return 1, 1
if numpy.min(y_pred) >= numpy.max(y_pred):
return 0, 1
r, sig = scipy.stats.pearsonr(y_true, y_pred)
if n < 3:
sig = 1
return r, sig


def score_variables(cross_frame, variables, outcome,
*,
is_classification=False):
Expand All @@ -178,10 +163,10 @@ def f(v):
if (n > 2) and \
(numpy.max(col) > numpy.min(col)) and \
(numpy.max(outcome) > numpy.min(outcome)):
cor, sig = our_corr_score(y_true=outcome, y_pred=col)
cor, sig = vtreat.stats_utils.our_corr_score(y_true=outcome, y_pred=col)
r2 = cor**2
if is_classification:
pass # TODO: fix this up
r2, sig = vtreat.stats_utils.our_pseudo_R2(y_true=outcome, y_pred=col)
sfi = pandas.DataFrame(
{
"variable": [v],
Expand Down

0 comments on commit d2e520e

Please sign in to comment.