
Commit d560b56

Update fairlens dependencies
2 parents 8c91c3c + 5b96b2a commit d560b56

File tree

6 files changed: +39 -24 lines changed

setup.cfg
src/fairlens/metrics/__init__.py
src/fairlens/metrics/correlation.py
src/fairlens/metrics/distance.py
src/fairlens/metrics/significance.py
tests/test_metrics.py

setup.cfg (+1 -1)

@@ -37,7 +37,6 @@ install_requires =
     matplotlib>=2.1.0
     seaborn>=0.11.1
     dcor>=0.5.3
-    pyemd==0.5.1

 [options.packages.find]
 where = src
@@ -53,6 +52,7 @@ dev =
 test =
     pytest>=6
     pytest-cov>=2
+    pyemd>=1.0.0

 doc =
     sphinx==3.5.4
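
Note: pyemd is no longer a runtime dependency of fairlens; it is only pulled in by the test extra (for example via pip install "fairlens[test]"), where the test suite still uses it as a reference implementation. As a purely hypothetical sketch (not code from this commit), an optional test dependency like this is typically guarded in pytest as follows:

import pytest

# Skip the EMD reference tests gracefully when the optional pyemd extra is not installed.
pyemd = pytest.importorskip("pyemd")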

src/fairlens/metrics/__init__.py (+2 -2)

@@ -29,7 +29,7 @@
 )

 from .significance import ( # isort:skip
-    binom_test,
+    binomtest,
     binominal_proportion_p_value,
     binominal_proportion_interval,
     bootstrap_binned_statistic,
@@ -61,7 +61,7 @@
     "r2_mcfadden",
     "kruskal_wallis",
     "kruskal_wallis_boolean",
-    "binom_test",
+    "binomtest",
     "binominal_proportion_p_value",
     "binominal_proportion_interval",
     "bootstrap_binned_statistic",

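Since fairlens.metrics only re-exports the name from .significance, the SciPy-style spelling is now the one visible on the package surface. A quick sanity check one could run against an environment with this commit installed (illustrative only):

import fairlens.metrics as metrics

# The renamed export is present and the old name is gone from the public list.
assert "binomtest" in metrics.__all__
assert "binom_test" not in metrics.__all__
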
src/fairlens/metrics/correlation.py (+8 -4)

@@ -184,9 +184,11 @@ def distance_nn_correlation(sr_a: pd.Series, sr_b: pd.Series) -> float:
     warnings.filterwarnings(action="ignore", category=UserWarning)

     if sr_a.size < sr_b.size:
-        sr_a = sr_a.append(pd.Series(sr_a.mean()).repeat(sr_b.size - sr_a.size), ignore_index=True)
+        new_serie = pd.Series(sr_a.mean()).repeat(sr_b.size - sr_a.size)
+        sr_a = pd.concat([sr_a, new_serie], ignore_index=True)
     elif sr_a.size > sr_b.size:
-        sr_b = sr_b.append(pd.Series(sr_b.mean()).repeat(sr_a.size - sr_b.size), ignore_index=True)
+        new_serie = pd.Series(sr_b.mean()).repeat(sr_a.size - sr_b.size)
+        sr_b = pd.concat([sr_b, new_serie], ignore_index=True)

     return dcor.distance_correlation(sr_a, sr_b)

@@ -222,9 +224,11 @@ def distance_cn_correlation(sr_a: pd.Series, sr_b: pd.Series) -> float:

             # Handle groups with a different number of elements.
             if sr_i.size < sr_j.size:
-                sr_i = sr_i.append(sr_i.sample(sr_j.size - sr_i.size, replace=True), ignore_index=True)
+                new_serie = sr_i.sample(sr_j.size - sr_i.size, replace=True)
+                sr_i = pd.concat([sr_i, new_serie], ignore_index=True)
             elif sr_i.size > sr_j.size:
-                sr_j = sr_j.append(sr_j.sample(sr_i.size - sr_j.size, replace=True), ignore_index=True)
+                new_serie = sr_j.sample(sr_i.size - sr_j.size, replace=True)
+                sr_j = pd.concat([sr_j, new_serie], ignore_index=True)
             total += dcor.distance_correlation(sr_i, sr_j)

     total /= n * (n - 1) / 2
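
Note: Series.append was deprecated in pandas 1.4 and removed in pandas 2.0; pd.concat is the standard replacement, as used above. A self-contained sketch of the same padding pattern with made-up data (not part of the commit):

import pandas as pd

sr_a = pd.Series([1.0, 2.0, 3.0])
sr_b = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])

# Pad the shorter series with its own mean so both series have equal length,
# mirroring distance_nn_correlation above.
if sr_a.size < sr_b.size:
    padding = pd.Series(sr_a.mean()).repeat(sr_b.size - sr_a.size)
    sr_a = pd.concat([sr_a, padding], ignore_index=True)

assert sr_a.size == sr_b.size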

src/fairlens/metrics/distance.py (+24 -14)

@@ -8,9 +8,8 @@

 import numpy as np
 import pandas as pd
-import pyemd
 from scipy.spatial.distance import jensenshannon
-from scipy.stats import entropy, kruskal, ks_2samp
+from scipy.stats import entropy, kruskal, ks_2samp, wasserstein_distance

 from .. import utils
 from ..metrics import significance as pv

@@ -304,19 +303,30 @@ class EarthMoversDistance(CategoricalDistanceMetric):
     """

     def distance_pdf(self, p: pd.Series, q: pd.Series, bin_edges: Optional[np.ndarray]) -> float:
-        distance_matrix = 1 - np.eye(len(p))
-
-        if bin_edges is not None:
-            # Use pair-wise euclidean distances between bin centers for scale data
-            bin_centers = np.mean([bin_edges[:-1], bin_edges[1:]], axis=0)
-            xx, yy = np.meshgrid(bin_centers, bin_centers)
-            distance_matrix = np.abs(xx - yy)
-
-        p = np.array(p).astype(np.float64)
-        q = np.array(q).astype(np.float64)
-        distance_matrix = distance_matrix.astype(np.float64)
+        p_sum = p.sum()
+        q_sum = q.sum()
+
+        if p_sum == 0 and q_sum == 0:
+            return 0.0
+        elif p_sum == 0 or q_sum == 0:
+            return 1.0
+
+        # Normalise counts for consistency with scipy.stats.wasserstein_distance.
+        with np.errstate(divide="ignore", invalid="ignore"):
+            p_normalised = np.nan_to_num(p / p_sum).astype(np.float64)
+            q_normalised = np.nan_to_num(q / q_sum).astype(np.float64)
+
+        if bin_edges is None:
+            # If bins are not given, the histograms are assumed to be counts of nominal
+            # categories, so distances between bins are meaningless. Set all distances
+            # to unity to model this.
+            distance = 0.5 * np.sum(np.abs(p_normalised - q_normalised))
+        else:
+            # Otherwise, use pair-wise euclidean distances between bin centers for scale data.
+            bin_centers = bin_edges[:-1] + np.diff(bin_edges) / 2.0
+            distance = wasserstein_distance(bin_centers, bin_centers, u_weights=p_normalised, v_weights=q_normalised)

-        return pyemd.emd(p, q, distance_matrix)
+        return distance

     @property
     def id(self) -> str:
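
For intuition, the rewrite still computes an earth mover's distance, just without the pyemd dependency: with a unit ground distance between distinct nominal categories, the EMD of two normalised histograms reduces to the total variation distance 0.5 * sum(|p - q|), and for binned interval data it is the one-dimensional Wasserstein distance over the bin centres, which scipy.stats.wasserstein_distance evaluates directly. A standalone sketch with illustrative values (not taken from the commit):

import numpy as np
from scipy.stats import wasserstein_distance

# Normalised histograms for two groups over the same three bins or categories.
p = np.array([0.2, 0.5, 0.3])
q = np.array([0.4, 0.4, 0.2])

# Nominal categories: unit ground distance, so EMD equals total variation distance.
tv_distance = 0.5 * np.sum(np.abs(p - q))

# Interval data: EMD over the bin centres of a shared binning.
bin_edges = np.array([0.0, 1.0, 2.0, 3.0])
bin_centers = bin_edges[:-1] + np.diff(bin_edges) / 2.0
emd = wasserstein_distance(bin_centers, bin_centers, u_weights=p, v_weights=q)

print(tv_distance, emd)  # 0.2 and 0.3 for these values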

src/fairlens/metrics/significance.py (+3 -2)

@@ -14,7 +14,7 @@

 import numpy as np
 import pandas as pd
-from scipy.stats import beta, binom_test, norm
+from scipy.stats import beta, binomtest, norm


 def binominal_proportion_p_value(p_obs: float, p_null: float, n: int, alternative: str = "two-sided") -> float:

@@ -37,7 +37,8 @@ def binominal_proportion_p_value(p_obs: float, p_null: float, n: int, alternative: str = "two-sided") -> float:
     """

     k = np.ceil(p_obs * n)
-    return binom_test(k, n, p_null, alternative)
+    result = binomtest(k, n, p_null, alternative)
+    return result.pvalue


 def binominal_proportion_interval(
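
The extra line at the call site reflects SciPy's newer API: scipy.stats.binomtest returns a BinomTestResult object rather than a bare p-value (and expects an integer number of successes k), so the p-value is read from its pvalue attribute. A minimal illustration with arbitrary numbers:

from scipy.stats import binomtest

result = binomtest(k=7, n=10, p=0.5, alternative="two-sided")
print(result.pvalue)  # the float that binom_test used to return directly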

tests/test_metrics.py (+1 -1)

@@ -45,7 +45,7 @@ def test_stat_distance_auto():

 def test_auto_binning():
     res = emd_samples(group1, group2)
-    assert stat_distance(df, target_attr, pred1, pred2, mode="emd")[0] == res
+    assert np.isclose(res, stat_distance(df, target_attr, pred1, pred2, mode="emd")[0], atol=1e-4)


 def test_mean_distance():
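
Since the EMD is now computed by a different backend than pyemd.emd_samples, bit-for-bit equality can no longer be expected, hence the tolerance-based comparison. np.isclose(a, b, atol=1e-4) checks that |a - b| <= atol + rtol * |b|, with a default rtol of 1e-5. A tiny sketch with made-up values:

import numpy as np

a = 0.123456
b = 0.123431  # agrees with a only to about four decimal places

assert a != b                       # exact float comparison fails
assert np.isclose(a, b, atol=1e-4)  # tolerance-based comparison passes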
