-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
12 changed files
with
239 additions
and
55 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
import numpy | ||
|
||
import scipy.stats | ||
|
||
have_sklearn = False | ||
# noinspection PyBroadException | ||
try: | ||
import sklearn.linear_model | ||
|
||
have_sklearn = True | ||
except Exception: | ||
pass | ||
|
||
|
||
# methods to avoid calling statsmodels which seems to be incompatible with many | ||
# versions of other packages we need: | ||
# https://github.com/WinVector/pyvtreat/issues/14 | ||
|
||
|
||
def our_corr_score(*, y_true, y_pred):
    """Return the Pearson correlation between y_true and y_pred, with its p-value.

    Degenerate inputs get fixed answers instead of being handed to scipy:

    * fewer than 2 observations, or a constant y_true: ``(1.0, 1.0)``
    * a constant y_pred (with varying y_true): ``(0.0, 1.0)``

    With exactly 2 observations the significance is forced to 1.

    :param y_true: sequence of numeric outcomes
    :param y_pred: sequence of numeric predictions, same length as y_true
    :return: pair ``(correlation, significance)``
    :raises ValueError: if y_true and y_pred differ in length
    """
    # float coercion lets bool / object-typed numeric columns through unchanged
    y_true = numpy.asarray(y_true, dtype=float)
    y_pred = numpy.asarray(y_pred, dtype=float)
    n = len(y_true)
    if len(y_pred) != n:
        # check before the early returns, which would otherwise mask the mismatch
        raise ValueError(
            "y_true and y_pred must have the same length, got "
            + str(n) + " and " + str(len(y_pred)))
    if n < 2:
        return 1.0, 1.0
    if numpy.min(y_true) >= numpy.max(y_true):
        # nothing to explain: outcome is constant
        return 1.0, 1.0
    if numpy.min(y_pred) >= numpy.max(y_pred):
        # constant prediction carries no information about a varying outcome
        return 0.0, 1.0
    r, sig = scipy.stats.pearsonr(y_true, y_pred)
    if n < 3:
        # two points always correlate perfectly; report no significance
        sig = 1.0
    return r, sig
|
||
|
||
# noinspection PyPep8Naming
def our_pseudo_R2(*, y_true, y_pred):
    """Return a deviance-based pseudo R-squared of y_pred for y_true, with a p-value.

    When sklearn could not be imported this falls back to the squared
    Pearson correlation from our_corr_score(). Otherwise a one-variable
    logistic regression of y_true on y_pred is fit, and the result is
    ``1 - deviance / null_deviance`` together with the chi-square
    significance of the deviance reduction (computed only when there are
    at least 3 observations, otherwise 1).

    Degenerate inputs get fixed answers: fewer than 2 observations or a
    constant y_true give ``(1, 1)``; a constant y_pred gives ``(0, 1)``.

    NOTE(review): the deviance formula treats y_true as a 0/1 indicator;
    that coding is assumed here and not checked.

    :param y_true: sequence of outcomes (assumed coded 0/1)
    :param y_pred: sequence of numeric predictions, same length as y_true
    :return: pair ``(pseudo_r2, significance)``
    """
    if not have_sklearn:
        # no logistic fitter available: approximate with squared correlation
        cor, sig = our_corr_score(y_true=y_true, y_pred=y_pred)
        return cor**2, sig
    y_true = numpy.asarray(y_true)
    y_pred = numpy.asarray(y_pred)
    n = len(y_true)
    if n < 2:
        return 1, 1
    if numpy.min(y_true) >= numpy.max(y_true):
        return 1, 1
    if numpy.min(y_pred) >= numpy.max(y_pred):
        return 0, 1

    def binomial_deviance(prob):
        # -2 * log-likelihood of y_true under per-row probabilities prob
        return -2 * numpy.sum(
            y_true * numpy.log(prob) + (1 - y_true) * numpy.log(1 - prob))

    # large C keeps the l2 penalty weak
    fitter = sklearn.linear_model.LogisticRegression(
        penalty='l2',
        solver='lbfgs',
        fit_intercept=True,
        C=1000)
    x = y_pred.reshape((n, 1))
    fitter.fit(X=x, y=y_true)
    preds = fitter.predict_proba(X=x)[:, 1]
    # keep probabilities away from 0 and 1 so the logs stay finite
    eps = 1e-5
    preds = numpy.minimum(preds, 1 - eps)
    preds = numpy.maximum(preds, eps)
    deviance = binomial_deviance(preds)
    null_pred = numpy.zeros(n) + numpy.mean(y_true)
    null_deviance = binomial_deviance(null_pred)
    r2 = 1 - deviance / null_deviance
    sig = 1

    if n >= 3:
        # https://github.com/WinVector/sigr/blob/master/R/ChiSqTest.R
        df_null = n - 1
        df_residual = n - 2
        delta_deviance = null_deviance - deviance
        delta_df = df_null - df_residual
        # survival function avoids the precision loss of 1 - cdf for tiny p-values
        sig = scipy.stats.chi2.sf(x=delta_deviance, df=delta_df)

    return r2, sig
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
|
||
import numpy | ||
|
||
import vtreat.stats_utils | ||
|
||
def test_linear_cor():
    """Compare our_corr_score with reference values from R's lm() on a fixed example."""
    # Reference run in R:
    #   y_true = c(1, 1, 0, 1, 0, 1, 1, 0, 1, 0)
    #   y_pred = c(0.8, 1, 0.2, 0.5, 0.5, 0.8, 1, 0.2, 0.5, 0.5)
    #   summary(lm(y_true ~ y_pred))
    #   Multiple R-squared:  0.5482, Adjusted R-squared:  0.4918
    #   F-statistic: 9.709 on 1 and 8 DF,  p-value: 0.01432
    outcomes = [1, 1, 0, 1, 0, 1, 1, 0, 1, 0]
    predictions = [0.8, 1, 0.2, 0.5, 0.5, 0.8, 1, 0.2, 0.5, 0.5]
    cor, sig = vtreat.stats_utils.our_corr_score(
        y_true=outcomes, y_pred=predictions)
    tolerance = 1.0e-2
    assert numpy.abs(cor * cor - 0.5482) < tolerance
    assert numpy.abs(sig - 0.01432) < tolerance
|
||
|
||
|
||
def test_logistic_r2():
    """Compare our_pseudo_R2 with reference values from R's glm() and sigr.

    Returns without checking anything when sklearn is not available.
    """
    if not vtreat.stats_utils.have_sklearn:
        return
    # Reference run in R:
    #   y_true = c(1, 1, 0, 0, 0, 1, 1, 0, 1, 1)
    #   y_pred = c(0.8, 1, 1, 0.5, 0.5, 0.8, 1, 0.2, 0.5, 0.5)
    #   (s <- summary(glm(y_true ~ y_pred, family = binomial())))
    #   Null deviance: 13.460  on 9  degrees of freedom
    #   Residual deviance: 11.762  on 8  degrees of freedom
    #   (w <- sigr::wrapChiSqTest(s))
    #   Chi-Square Test summary: pseudo-R2=0.1262 (X2(1,N=10)=1.698, p=n.s.).
    #   w$pValue
    #   [1] 0.1925211
    outcomes = [1, 1, 0, 0, 0, 1, 1, 0, 1, 1]
    predictions = [0.8, 1, 1, 0.5, 0.5, 0.8, 1, 0.2, 0.5, 0.5]
    tolerance = 1.0e-2
    check_r2 = 1 - 11.762/13.460
    r2, sig = vtreat.stats_utils.our_pseudo_R2(
        y_true=outcomes, y_pred=predictions)
    assert numpy.abs(r2 - check_r2) < tolerance
    assert numpy.abs(r2 - 0.1262) < tolerance
    assert numpy.abs(sig - 0.1925211) < tolerance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,9 @@ | ||
numpy | ||
pandas | ||
scipy | ||
|
||
[all] | ||
sklearn | ||
|
||
[pseudoR2] | ||
sklearn |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
import numpy | ||
|
||
import scipy.stats | ||
|
||
have_sklearn = False | ||
# noinspection PyBroadException | ||
try: | ||
import sklearn.linear_model | ||
|
||
have_sklearn = True | ||
except Exception: | ||
pass | ||
|
||
|
||
# methods to avoid calling statsmodels which seems to be incompatible with many | ||
# versions of other packages we need: | ||
# https://github.com/WinVector/pyvtreat/issues/14 | ||
|
||
|
||
def our_corr_score(*, y_true, y_pred):
    """Return the Pearson correlation between y_true and y_pred, with its p-value.

    Degenerate inputs get fixed answers instead of being handed to scipy:

    * fewer than 2 observations, or a constant y_true: ``(1.0, 1.0)``
    * a constant y_pred (with varying y_true): ``(0.0, 1.0)``

    With exactly 2 observations the significance is forced to 1.

    :param y_true: sequence of numeric outcomes
    :param y_pred: sequence of numeric predictions, same length as y_true
    :return: pair ``(correlation, significance)``
    :raises ValueError: if y_true and y_pred differ in length
    """
    # float coercion lets bool / object-typed numeric columns through unchanged
    y_true = numpy.asarray(y_true, dtype=float)
    y_pred = numpy.asarray(y_pred, dtype=float)
    n = len(y_true)
    if len(y_pred) != n:
        # check before the early returns, which would otherwise mask the mismatch
        raise ValueError(
            "y_true and y_pred must have the same length, got "
            + str(n) + " and " + str(len(y_pred)))
    if n < 2:
        return 1.0, 1.0
    if numpy.min(y_true) >= numpy.max(y_true):
        # nothing to explain: outcome is constant
        return 1.0, 1.0
    if numpy.min(y_pred) >= numpy.max(y_pred):
        # constant prediction carries no information about a varying outcome
        return 0.0, 1.0
    r, sig = scipy.stats.pearsonr(y_true, y_pred)
    if n < 3:
        # two points always correlate perfectly; report no significance
        sig = 1.0
    return r, sig
|
||
|
||
# noinspection PyPep8Naming
def our_pseudo_R2(*, y_true, y_pred):
    """Return a deviance-based pseudo R-squared of y_pred for y_true, with a p-value.

    When sklearn could not be imported this falls back to the squared
    Pearson correlation from our_corr_score(). Otherwise a one-variable
    logistic regression of y_true on y_pred is fit, and the result is
    ``1 - deviance / null_deviance`` together with the chi-square
    significance of the deviance reduction (computed only when there are
    at least 3 observations, otherwise 1).

    Degenerate inputs get fixed answers: fewer than 2 observations or a
    constant y_true give ``(1, 1)``; a constant y_pred gives ``(0, 1)``.

    NOTE(review): the deviance formula treats y_true as a 0/1 indicator;
    that coding is assumed here and not checked.

    :param y_true: sequence of outcomes (assumed coded 0/1)
    :param y_pred: sequence of numeric predictions, same length as y_true
    :return: pair ``(pseudo_r2, significance)``
    """
    if not have_sklearn:
        # no logistic fitter available: approximate with squared correlation
        cor, sig = our_corr_score(y_true=y_true, y_pred=y_pred)
        return cor**2, sig
    y_true = numpy.asarray(y_true)
    y_pred = numpy.asarray(y_pred)
    n = len(y_true)
    if n < 2:
        return 1, 1
    if numpy.min(y_true) >= numpy.max(y_true):
        return 1, 1
    if numpy.min(y_pred) >= numpy.max(y_pred):
        return 0, 1

    def binomial_deviance(prob):
        # -2 * log-likelihood of y_true under per-row probabilities prob
        return -2 * numpy.sum(
            y_true * numpy.log(prob) + (1 - y_true) * numpy.log(1 - prob))

    # large C keeps the l2 penalty weak
    fitter = sklearn.linear_model.LogisticRegression(
        penalty='l2',
        solver='lbfgs',
        fit_intercept=True,
        C=1000)
    x = y_pred.reshape((n, 1))
    fitter.fit(X=x, y=y_true)
    preds = fitter.predict_proba(X=x)[:, 1]
    # keep probabilities away from 0 and 1 so the logs stay finite
    eps = 1e-5
    preds = numpy.minimum(preds, 1 - eps)
    preds = numpy.maximum(preds, eps)
    deviance = binomial_deviance(preds)
    null_pred = numpy.zeros(n) + numpy.mean(y_true)
    null_deviance = binomial_deviance(null_pred)
    r2 = 1 - deviance / null_deviance
    sig = 1

    if n >= 3:
        # https://github.com/WinVector/sigr/blob/master/R/ChiSqTest.R
        df_null = n - 1
        df_residual = n - 2
        delta_deviance = null_deviance - deviance
        delta_df = df_null - df_residual
        # survival function avoids the precision loss of 1 - cdf for tiny p-values
        sig = scipy.stats.chi2.sf(x=delta_deviance, df=delta_df)

    return r2, sig
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters