Skip to content

Commit b0793ac

Browse files
authored
Merge pull request #1515 from ales-erjavec/ward
[ENH] Enable Ward clustering in Hierarchical clustering widget
2 parents d464b58 + 0982585 commit b0793ac

File tree

5 files changed

+68
-24
lines changed

5 files changed

+68
-24
lines changed

Orange/clustering/hierarchical.py

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,14 @@
11
from collections import namedtuple, deque
22
from operator import attrgetter
33
from itertools import chain, count
4+
from distutils.version import LooseVersion as _LooseVersion
5+
46
import heapq
57
import numpy
68

79
import scipy.cluster.hierarchy
10+
import scipy.spatial.distance
11+
812
from Orange.distance import Euclidean, PearsonR
913

1014
__all__ = ['HierarchicalClustering']
@@ -15,6 +19,17 @@
1519
WEIGHTED = "weighted"
1620
WARD = "ward"
1721

22+
# Does scipy implement an O(n**2) NN chain algorithm?
23+
_HAS_NN_CHAIN = hasattr(scipy.cluster.hierarchy, "_hierarchy") and \
24+
hasattr(scipy.cluster.hierarchy._hierarchy, "nn_chain")
25+
26+
# Prior to 0.18 scipy.cluster.hierarchy's Python interface disallowed
27+
# ward clustering from a precomputed distance matrix even though its Cython
28+
# implementation allowed it and was documented to support it (scipy issue 5220)
29+
_HAS_WARD_LINKAGE_FROM_DIST = \
30+
_LooseVersion(scipy.__version__) >= _LooseVersion("0.18") and \
31+
_HAS_NN_CHAIN
32+
1833

1934
def condensedform(X, mode="upper"):
2035
X = numpy.asarray(X)
@@ -86,7 +101,23 @@ def dist_matrix_linkage(matrix, linkage=AVERAGE):
86101
"""
87102
# Extract compressed upper triangular distance matrix.
88103
distances = condensedform(matrix)
89-
return scipy.cluster.hierarchy.linkage(distances, method=linkage)
104+
if linkage == WARD and not _HAS_WARD_LINKAGE_FROM_DIST:
105+
# Avoid `scipy.cluster.hierarchy.linkage` and dispatch to its
106+
# cython implementation directly.
107+
# This is the core of scipy.cluster.hierarchy.linkage in
108+
# scipy 0.16, 0.17. Assuming the branches are in bug fix mode
109+
# only so this interface will not change.
110+
y = numpy.asarray(distances, dtype=float)
111+
scipy.spatial.distance.is_valid_y(y, throw=True)
112+
N = scipy.spatial.distance.num_obs_y(y)
113+
# allocate the output linkage matrix
114+
Z = numpy.zeros((N - 1, 4))
115+
# retrieve the correct method flag
116+
method = scipy.cluster.hierarchy._cpy_euclid_methods["ward"]
117+
scipy.cluster.hierarchy._hierarchy.linkage(y, Z, int(N), int(method))
118+
return Z
119+
else:
120+
return scipy.cluster.hierarchy.linkage(distances, method=linkage)
90121

91122

92123
def dist_matrix_clustering(matrix, linkage=AVERAGE):
@@ -96,9 +127,7 @@ def dist_matrix_clustering(matrix, linkage=AVERAGE):
96127
:param Orange.misc.DistMatrix matrix:
97128
:param str linkage:
98129
"""
99-
# Extract compressed upper triangular distance matrix.
100-
distances = condensedform(matrix)
101-
Z = scipy.cluster.hierarchy.linkage(distances, method=linkage)
130+
Z = dist_matrix_linkage(matrix, linkage=linkage)
102131
return tree_from_linkage(Z)
103132

104133

@@ -363,12 +392,11 @@ def item(node):
363392
heap = [item(tree)]
364393

365394
while len(heap) < k:
366-
key, cl = heapq.heappop(heap)
395+
_, cl = heap[0] # peek
367396
if cl.is_leaf:
368397
assert all(n.is_leaf for _, n in heap)
369-
heapq.heappush(heap, (key, cl))
370398
break
371-
399+
key, cl = heapq.heappop(heap)
372400
left, right = cl.left, cl.right
373401
heapq.heappush(heap, item(left))
374402
heapq.heappush(heap, item(right))

Orange/tests/test_clustering_hierarchical.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,16 @@ def test_prunning(self):
7070
pruned = hierarchical.prune(self.cluster, height=10)
7171
self.assertTrue(c.height >= 10 for c in hierarchical.preorder(pruned))
7272

73+
top = hierarchical.top_clusters(self.cluster, 3)
74+
self.assertEqual(len(top), 3)
75+
76+
top = hierarchical.top_clusters(self.cluster, len(self.matrix))
77+
self.assertEqual(len(top), len(self.matrix))
78+
self.assertTrue(all(n.is_leaf for n in top))
79+
80+
top1 = hierarchical.top_clusters(self.cluster, len(self.matrix) + 1)
81+
self.assertEqual(top1, top)
82+
7383
def test_form(self):
7484
m = [[0, 2, 3, 4],
7585
[2, 0, 6, 7],
@@ -117,6 +127,14 @@ def score(root):
117127
self.assertGreater(score_unordered, score_ordered)
118128
self.assertEqual(score_ordered, 21.0)
119129

130+
def test_table_clustering(self):
131+
table = Orange.data.Table(numpy.eye(3))
132+
tree = hierarchical.data_clustering(table, linkage="single")
133+
numpy.testing.assert_almost_equal(tree.value.height, numpy.sqrt(2))
134+
135+
tree = hierarchical.feature_clustering(table)
136+
numpy.testing.assert_almost_equal(tree.value.height, 0.75)
137+
120138

121139
class TestTree(unittest.TestCase):
122140
def test_tree(self):

Orange/widgets/unsupervised/owdistancemap.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,10 @@ class OWDistanceMap(widget.OWWidget):
263263
graph_name = "grid_widget"
264264

265265
# Disable clustering for inputs bigger than this
266-
_MaxClustering = 3000
266+
if hierarchical._HAS_NN_CHAIN:
267+
_MaxClustering = 25000
268+
else:
269+
_MaxClustering = 3000
267270

268271
# Disable cluster leaf ordering for inputs bigger than this
269272
_MaxOrderedClustering = 1000

Orange/widgets/unsupervised/owhierarchicalclustering.py

Lines changed: 6 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,9 @@
33

44
from collections import namedtuple, OrderedDict
55
from itertools import chain
6-
from functools import reduce
76
from contextlib import contextmanager
87

98
import numpy
10-
import scipy.cluster.hierarchy
119

1210
from PyQt4.QtGui import (
1311
QGraphicsWidget, QGraphicsObject, QGraphicsLinearLayout, QGraphicsPathItem,
@@ -26,19 +24,17 @@
2624
from Orange.data.domain import filter_visible
2725
import Orange.misc
2826
from Orange.clustering.hierarchical import \
29-
postorder, preorder, Tree, tree_from_linkage, leaves, prune, top_clusters
27+
postorder, preorder, Tree, tree_from_linkage, dist_matrix_linkage, \
28+
leaves, prune, top_clusters
3029

3130
from Orange.widgets import widget, gui, settings
3231
from Orange.widgets.utils import colorpalette, itemmodels
3332
from Orange.widgets.io import FileFormat
3433

3534
__all__ = ["OWHierarchicalClustering"]
3635

37-
# In scipy 0.14 ward linkage cannot be computed from the distance
38-
# matrix alone, it requires the full data matrix (in 0.15 the whole
39-
# hierarchical clustering is/will be reimplemented and from the
40-
# looks of it will support ward from dist matrix).
41-
LINKAGE = ["Single", "Average", "Weighted", "Complete"]
36+
37+
LINKAGE = ["Single", "Average", "Weighted", "Complete", "Ward"]
4238

4339

4440
def dendrogram_layout(tree, expand_leaves=False):
@@ -1003,14 +999,9 @@ def _update(self):
1003999
distances = self.matrix
10041000

10051001
if distances is not None:
1006-
# Convert to flat upper triangular distances
1007-
i, j = numpy.triu_indices(distances.shape[0], k=1)
1008-
distances = numpy.asarray(distances[i, j])
1009-
10101002
method = LINKAGE[self.linkage].lower()
1011-
Z = scipy.cluster.hierarchy.linkage(
1012-
distances, method=method
1013-
)
1003+
Z = dist_matrix_linkage(distances, linkage=method)
1004+
10141005
tree = tree_from_linkage(Z)
10151006
self.linkmatrix = Z
10161007
self.root = tree

Orange/widgets/visualize/owheatmap.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727

2828
from Orange.widgets.unsupervised.owhierarchicalclustering import \
2929
DendrogramWidget
30+
3031
from Orange.widgets.widget import Msg
3132

3233

@@ -398,7 +399,10 @@ class OWHeatMap(widget.OWWidget):
398399
(OrderedClustering, "Clustering with leaf ordering")
399400
]
400401
# Disable clustering for inputs bigger than this
401-
_MaxClustering = 3000
402+
if hierarchical._HAS_NN_CHAIN:
403+
_MaxClustering = 25000
404+
else:
405+
_MaxClustering = 3000
402406

403407
# Disable cluster leaf ordering for inputs bigger than this
404408
_MaxOrderedClustering = 1000

0 commit comments

Comments
 (0)