Skip to content

[ENH] Support sparse Jaccard #3657

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Mar 15, 2019
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions Orange/distance/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np
import sklearn.metrics as skl_metrics
from scipy.sparse import issparse, csr_matrix

from Orange.data import Table, Domain, Instance, RowInstance
from Orange.misc import DistMatrix
Expand All @@ -13,6 +14,7 @@
# TODO this *private* function is called from several widgets to prepare
# data for calling the below classes. After we (mostly) stopped relying
# on sklearn.metrics, this is (mostly) unnecessary

def _preprocess(table, impute=True):
"""Remove categorical attributes and impute missing values."""
if not len(table):
Expand Down Expand Up @@ -499,3 +501,51 @@ def __call__(self, e1, e2=None, axis=1, impute=False):
else:
dist_matrix = DistMatrix(dist)
return dist_matrix

class SparseJaccard:
    """
    Fallback for `Jaccard` on sparse data or raw numpy arrays.

    Sparse input is handled by `sparse_jaccard`, which computes
    1 - |intersection| / |union| over the sets of columns that hold non-zero
    values. Dense input is passed to sklearn's pairwise distances with the
    "jaccard" metric; sklearn's function can't handle discrete or missing
    data and normalization.
    """

    def __call__(self, e1, e2=None, axis=1, impute=False):
        """
        Compute Jaccard distances and wrap them in a `DistMatrix`.

        Args:
            e1: first data set; converted with `_orange_to_numpy`
            e2: optional second data set; if `None`, distances are computed
                between the items of `e1`
            axis: if 0, both data sets are transposed before computing
            impute: if true, NaNs in the result are replaced with
                `np.nan_to_num`

        Returns:
            DistMatrix: the distances; row/column items and axis are attached
            only when `e1` is a `Table` or a `RowInstance`
        """
        x1 = _orange_to_numpy(e1)
        x2 = _orange_to_numpy(e2)
        if axis == 0:
            x1 = x1.T
            if x2 is not None:
                x2 = x2.T
        if issparse(x1):
            dist = self.sparse_jaccard(x1, x2)
        else:
            dist = skl_metrics.pairwise.pairwise_distances(
                x1, x2, metric="jaccard")
        if impute and np.isnan(dist).any():
            dist = np.nan_to_num(dist)
        if isinstance(e1, (Table, RowInstance)):
            dist_matrix = DistMatrix(dist, e1, e2, axis)
        else:
            dist_matrix = DistMatrix(dist)
        return dist_matrix

    def sparse_jaccard(self, x1, x2=None):
        """
        Return the matrix of Jaccard distances between rows of `x1` and `x2`.

        Each row is treated as the set of column indices with non-zero
        values; explicitly stored zeros are ignored. The inputs are not
        modified.

        Args:
            x1: matrix accepted by `csr_matrix`
            x2: optional second matrix; if `None`, distances are computed
                between the rows of `x1` and the result is symmetric with
                a zero diagonal

        Returns:
            np.ndarray: array of shape (rows of x1, rows of x2)
        """
        symmetric = x2 is None
        rows1 = self._nonzero_column_sets(x1)
        rows2 = rows1 if symmetric else self._nonzero_column_sets(x2)
        n, m = len(rows1), len(rows2)
        matrix = np.zeros((n, m))
        for i, cols_i in enumerate(rows1):
            # In the symmetric case only the lower triangle is computed and
            # mirrored; the diagonal keeps its initial value of 0.
            for j in range(i if symmetric else m):
                cols_j = rows2[j]
                n_union = len(cols_i | cols_j)
                if n_union:
                    jacc = 1 - len(cols_i & cols_j) / n_union
                else:
                    # Both rows are empty: treat two identical (empty) sets
                    # as being at distance 0 instead of dividing by zero.
                    jacc = 0.0
                matrix[i, j] = jacc
                if symmetric:
                    matrix[j, i] = jacc
        return matrix

    @staticmethod
    def _nonzero_column_sets(x):
        """Return, for each row of `x`, the set of columns with non-zeros."""
        x = csr_matrix(x)
        indptr, indices, data = x.indptr, x.indices, x.data
        # Stored zeros are filtered with a mask rather than with
        # `eliminate_zeros()`, which would alter buffers that `csr_matrix(x)`
        # may share with the caller's matrix.
        return [
            set(indices[start:end][data[start:end] != 0].tolist())
            for start, end in zip(indptr[:-1], indptr[1:])
        ]
6 changes: 3 additions & 3 deletions Orange/distance/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from Orange.statistics import util

from .base import (Distance, DistanceModel, FittedDistance, FittedDistanceModel,
SklDistance, _orange_to_numpy)
SklDistance, _orange_to_numpy, SparseJaccard)

class EuclideanRowsModel(FittedDistanceModel):
"""
Expand Down Expand Up @@ -416,9 +416,9 @@ def compute_distances(self, x1, x2):


class Jaccard(FittedDistance):
supports_sparse = False
supports_sparse = True
supports_discrete = True
fallback = SklDistance('jaccard')
fallback = SparseJaccard()
ModelType = JaccardModel

def fit_rows(self, attributes, x, n_vals):
Expand Down
20 changes: 17 additions & 3 deletions Orange/distance/tests/test_distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,13 @@ def test_no_data(self):

def test_sparse(self):
"""Test sparse support in distances."""
sparse_iris = csr_matrix(Table('iris').X)
if not self.Distance.supports_sparse:
self.assertRaises(TypeError, self.Distance, sparse_iris)
self.assertRaises(TypeError, self.Distance, self.sparse_data)
else:
self.Distance(sparse_iris)
# check the result is the same as for dense
dist_numpy = self.Distance(self.dense_X)
dist_sparse = self.Distance(self.sparse_data)
np.testing.assert_allclose(dist_sparse, dist_numpy)


class CommonFittedTests(CommonTests):
Expand Down Expand Up @@ -144,6 +146,12 @@ def setUp(self):
self.mixed_data = self.data = Table.from_numpy(
self.domain, np.hstack((self.cont_data.X[:3], self.disc_data.X)))

self.dense_X = np.array([[1, 0, 2],
[-1, 5, 0],
[0, 1, 1],
[7, 0, 0]])
self.sparse_data = Table(csr_matrix(self.dense_X))



# Correct results in these tests were computed manually or with Excel;
Expand Down Expand Up @@ -838,6 +846,12 @@ def setUp(self):
[1, 0, 1],
[1, 0, 0]])

self.dense_X = np.array([[1, 0, 2],
[-1, 5, 0],
[0, 1, 1],
[7, 0, 0]])
self.sparse_data = Table(csr_matrix(self.dense_X))

def test_jaccard_rows(self):
assert_almost_equal = np.testing.assert_almost_equal

Expand Down
24 changes: 15 additions & 9 deletions Orange/widgets/unsupervised/owdistances.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,9 @@ def _check_sparse():
def _fix_discrete():
nonlocal data
if data.domain.has_discrete_attributes() and (
issparse(data.X) and getattr(metric, "fallback", None)
issparse(data.X) and getattr(metric, "fallback",
None) and metric is not
distance.Jaccard
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason for not checking metric.supports_sparse instead of metric is not distance.Jaccard?

The condition that specifically checks for Jaccard a few lines later is needed because there is no specific flag signalling whether a metric supports distances by columns. Here, we have a flag to check, unless I overlooked something.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Jaccard is the only distance that supports discrete attributes via fallback. Other metrics fall back to sklearn's methods, which won't work with discrete.

or not metric.supports_discrete
or self.axis == 1 and metric is not distance.Jaccard):
if not data.domain.has_continuous_attributes():
Expand All @@ -132,14 +134,18 @@ def _fix_discrete():
def _fix_nonbinary():
nonlocal data
if metric is distance.Jaccard:
nbinary = sum(a.is_discrete and len(a.values) == 2
for a in data.domain.attributes)
if not nbinary:
self.Error.no_binary_features()
return False
elif nbinary < len(data.domain.attributes):
self.Warning.ignoring_nonbinary()
data = distance.remove_nonbinary_features(data)
if issparse(data.X):
# do not remove non-binary
return True
else:
nbinary = sum(a.is_discrete and len(a.values) == 2
for a in data.domain.attributes)
if not nbinary:
self.Error.no_binary_features()
return False
elif nbinary < len(data.domain.attributes):
self.Warning.ignoring_nonbinary()
data = distance.remove_nonbinary_features(data)
return True

def _fix_missing():
Expand Down