biolab · janezd · Mar 15, 2019 · Mar 1, 2019 · Mar 4, 2019 · Mar 4, 2019
diff --git a/Orange/distance/base.py b/Orange/distance/base.py
@@ -1,5 +1,6 @@
 import numpy as np
 import sklearn.metrics as skl_metrics
+from scipy.sparse import issparse, csr_matrix
 
 from Orange.data import Table, Domain, Instance, RowInstance
 from Orange.misc import DistMatrix
@@ -13,6 +14,7 @@
 # TODO this *private* function is called from several widgets to prepare
 # data for calling the below classes. After we (mostly) stopped relying
 # on sklearn.metrics, this is (mostly) unnecessary
+
 def _preprocess(table, impute=True):
     """Remove categorical attributes and impute missing values."""
     if not len(table):
@@ -499,3 +501,51 @@ def __call__(self, e1, e2=None, axis=1, impute=False):
         else:
             dist_matrix = DistMatrix(dist)
         return dist_matrix
+
+class SparseJaccard:
+    """
+    Fallback for `Jaccard` on sparse data or raw numpy arrays. Sparse rows
+    are compared as sets of nonzero columns (1 - intersection / union); dense
+    data goes to sklearn, which can't handle discrete or missing data.
+    """
+
+    def __call__(self, e1, e2=None, axis=1, impute=False):
+        """Return a `DistMatrix` of distances; `axis=0` compares columns."""
+        x1 = _orange_to_numpy(e1)
+        x2 = _orange_to_numpy(e2)
+        if axis == 0:
+            x1 = x1.T
+            if x2 is not None:
+                x2 = x2.T
+        if issparse(x1):
+            dist = self.sparse_jaccard(x1, x2)
+        else:
+            dist = skl_metrics.pairwise.pairwise_distances(
+                x1, x2, metric="jaccard")
+        if impute and np.isnan(dist).any():
+            dist = np.nan_to_num(dist)
+        if isinstance(e1, (Table, RowInstance)):
+            return DistMatrix(dist, e1, e2, axis)
+        return DistMatrix(dist)
+
+    def sparse_jaccard(self, x1, x2=None):
+        """Return row-by-row distances; `x2=None` compares `x1` to itself."""
+        symmetric = x2 is None
+        x1 = csr_matrix(x1)
+        x1.eliminate_zeros()
+        x2 = x1 if symmetric else csr_matrix(x2)
+        x2.eliminate_zeros()
+        n, m = x1.shape[0], x2.shape[0]
+        matrix = np.zeros((n, m))
+        for i in range(n):
+            xi_ind = set(x1[i].indices)
+            # the symmetric case fills both triangles from the lower one
+            for j in range(i if symmetric else m):
+                xj_ind = set(x2[j].indices)
+                union = len(xi_ind | xj_ind)
+                # two empty rows: distance 0 instead of dividing by zero
+                jacc = (1 - len(xi_ind & xj_ind) / union) if union else 0.0
+                matrix[i, j] = jacc
+                if symmetric:
+                    matrix[j, i] = jacc
+        return matrix
diff --git a/Orange/distance/distance.py b/Orange/distance/distance.py
@@ -11,7 +11,7 @@
 from Orange.statistics import util
 
 from .base import (Distance, DistanceModel, FittedDistance, FittedDistanceModel,
-                   SklDistance, _orange_to_numpy)
+                   SklDistance, _orange_to_numpy, SparseJaccard)
 
 class EuclideanRowsModel(FittedDistanceModel):
     """
@@ -416,9 +416,9 @@ def compute_distances(self, x1, x2):
 
 
 class Jaccard(FittedDistance):
-    supports_sparse = False
+    supports_sparse = True
     supports_discrete = True
-    fallback = SklDistance('jaccard')
+    fallback = SparseJaccard()
     ModelType = JaccardModel
 
     def fit_rows(self, attributes, x, n_vals):

diff --git a/Orange/distance/tests/test_distance.py b/Orange/distance/tests/test_distance.py
@@ -27,11 +27,13 @@ def test_no_data(self):
 
     def test_sparse(self):
         """Test sparse support in distances."""
-        sparse_iris = csr_matrix(Table('iris').X)
         if not self.Distance.supports_sparse:
-            self.assertRaises(TypeError, self.Distance, sparse_iris)
+            self.assertRaises(TypeError, self.Distance, self.sparse_data)
         else:
-            self.Distance(sparse_iris)
+            # sparse and dense input must give the same distances
+            dist_numpy = self.Distance(self.dense_X)
+            dist_sparse = self.Distance(self.sparse_data)
+            np.testing.assert_allclose(dist_sparse, dist_numpy)
 
 
 class CommonFittedTests(CommonTests):
@@ -144,6 +146,12 @@ def setUp(self):
         self.mixed_data = self.data = Table.from_numpy(
             self.domain, np.hstack((self.cont_data.X[:3], self.disc_data.X)))
 
+        self.dense_X = np.array([[1, 0, 2],
+                                 [-1, 5, 0],
+                                 [0, 1, 1],
+                                 [7, 0, 0]])
+        self.sparse_data = Table(csr_matrix(self.dense_X))
+
 
 
 # Correct results in these tests were computed manually or with Excel;
@@ -838,6 +846,12 @@ def setUp(self):
              [1, 0, 1],
              [1, 0, 0]])
 
+        self.dense_X = np.array([[1, 0, 2],
+                      [-1, 5, 0],
+                      [0, 1, 1],
+                      [7, 0, 0]])
+        self.sparse_data = Table(csr_matrix(self.dense_X))
+
     def test_jaccard_rows(self):
         assert_almost_equal = np.testing.assert_almost_equal
 

diff --git a/Orange/widgets/unsupervised/owdistances.py b/Orange/widgets/unsupervised/owdistances.py
@@ -119,7 +119,9 @@ def _check_sparse():
         def _fix_discrete():
             nonlocal data
             if data.domain.has_discrete_attributes() and (
-                    issparse(data.X) and getattr(metric, "fallback", None)
+                    issparse(data.X) and getattr(metric, "fallback",
+                                                 None) and metric is not
+                                distance.Jaccard
                     or not metric.supports_discrete
                     or self.axis == 1 and metric is not distance.Jaccard):
                 if not data.domain.has_continuous_attributes():
@@ -132,14 +134,18 @@ def _fix_discrete():
         def _fix_nonbinary():
             nonlocal data
             if metric is distance.Jaccard:
-                nbinary = sum(a.is_discrete and len(a.values) == 2
-                              for a in data.domain.attributes)
-                if not nbinary:
-                    self.Error.no_binary_features()
-                    return False
-                elif nbinary < len(data.domain.attributes):
-                    self.Warning.ignoring_nonbinary()
-                    data = distance.remove_nonbinary_features(data)
+                if issparse(data.X):
+                    # sparse data: keep every feature, binary or not
+                    return True
+                else:
+                    nbinary = sum(a.is_discrete and len(a.values) == 2
+                                  for a in data.domain.attributes)
+                    if not nbinary:
+                        self.Error.no_binary_features()
+                        return False
+                    elif nbinary < len(data.domain.attributes):
+                        self.Warning.ignoring_nonbinary()
+                        data = distance.remove_nonbinary_features(data)
             return True
 
         def _fix_missing():