
Commit bb88643

Merge pull request #236 from pavlin-policar/fix-bh-memory

Fix exploding memory problem in BH for duplicate rows

2 parents 99e97d9 + 67bd399

4 files changed (+37 −15)

openTSNE/_tsne.pyx (+6 −6)

@@ -10,7 +10,7 @@ from cython.parallel import prange, parallel
 from cpython.mem cimport PyMem_Malloc, PyMem_Free
 from libc.stdlib cimport malloc, free
 
-from .quad_tree cimport QuadTree, Node, is_duplicate
+from .quad_tree cimport QuadTree, Node, is_close
 from ._matrix_mul.matrix_mul cimport matrix_multiply_fft_1d, matrix_multiply_fft_2d
 
 
@@ -178,7 +178,7 @@ cpdef double estimate_negative_gradient_bh(
     Py_ssize_t num_threads=1,
     bint pairwise_normalization=True,
 ):
-    """Estimate the negative tSNE gradient using the Barnes Hut approximation.
+    """Estimate the negative t-SNE gradient using the Barnes-Hut approximation.
 
     Notes
     -----
@@ -197,7 +197,7 @@ cpdef double estimate_negative_gradient_bh(
         num_threads = 1
 
     # In order to run gradient estimation in parallel, we need to pass each
-    # worker it's own memory slot to write sum_Qs
+    # worker its own memory slot to write sum_Qs
     for i in prange(num_points, nogil=True, num_threads=num_threads, schedule="guided"):
         _estimate_negative_gradient_single(
             &tree.root, &embedding[i, 0], &gradient[i, 0], &sum_Qi[i], theta, dof
@@ -225,16 +225,16 @@ cdef void _estimate_negative_gradient_single(
     double theta,
     double dof,
 ) nogil:
-    # Make sure that we spend no time on empty nodes or self-interactions
-    if node.num_points == 0 or node.is_leaf and is_duplicate(node, point):
+    # Make sure that we spend no time on empty nodes or simple self-interactions
+    if node.num_points == 0 or node.is_leaf and is_close(node, point, EPSILON):
         return
 
     cdef:
         double distance = EPSILON
         double q_ij, tmp
         Py_ssize_t d
 
-    # Compute the squared euclidean disstance in the embedding space from the
+    # Compute the squared euclidean distance in the embedding space from the
     # new point to the center of mass
     for d in range(node.n_dims):
         tmp = node.center_of_mass[d] - point[d]

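Aside: the comment corrected in the third hunk describes a pattern worth spelling out. Inside the prange loop the workers run without the GIL, so they cannot all accumulate into a single shared sum_Q without racing; instead each point i writes its partial sum into its own slot sum_Qi[i], and the slots are reduced once after the loop. A rough plain-Python sketch of that per-worker-slot pattern (using ThreadPoolExecutor and a dense stand-in computation, neither of which is openTSNE's actual code):

import numpy as np
from concurrent.futures import ThreadPoolExecutor

def negative_term(i, embedding, sum_Qi):
    # Stand-in for _estimate_negative_gradient_single: each task writes
    # only to its own slot sum_Qi[i], so no locking is needed.
    d2 = np.sum((embedding - embedding[i]) ** 2, axis=1)
    sum_Qi[i] = np.sum(1.0 / (1.0 + d2)) - 1.0  # drop the i == i term

embedding = np.random.randn(100, 2)
sum_Qi = np.zeros(len(embedding))
with ThreadPoolExecutor() as pool:
    list(pool.map(lambda i: negative_term(i, embedding, sum_Qi), range(len(embedding))))
sum_Q = sum_Qi.sum()  # single reduction after the parallel section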
openTSNE/quad_tree.pxd (+1 −3)

@@ -5,8 +5,6 @@
 # cython: language_level=3
 import numpy as np
 
-cdef double EPSILON = np.finfo(np.float64).eps
-
 ctypedef struct Node:
     Py_ssize_t n_dims
     double *center
@@ -19,7 +17,7 @@ ctypedef struct Node:
     Py_ssize_t num_points
 
 
-cdef bint is_duplicate(Node * node, double * point, double duplicate_eps=*) nogil
+cdef bint is_close(Node * node, double * point, double eps) nogil
 
 
 cdef class QuadTree:

openTSNE/quad_tree.pyx (+7 −6)

@@ -71,14 +71,15 @@ cdef Py_ssize_t get_child_idx_for(Node * node, double * point) nogil:
 cdef inline void update_center_of_mass(Node * node, double * point) nogil:
     cdef Py_ssize_t d
     for d in range(node.n_dims):
-        node.center_of_mass[d] = (node.center_of_mass[d] * node.num_points + point[d]) \
-            / (node.num_points + 1)
+        node.center_of_mass[d] = (node.center_of_mass[d] * node.num_points + point[d]) / (node.num_points + 1)
     node.num_points += 1
 
 
 cdef void add_point_to(Node * node, double * point):
-    # If the node is a leaf node and empty, we"re done
-    if node.is_leaf and node.num_points == 0 or is_duplicate(node, point):
+    # If the node is a leaf node and empty, we're done. We'll also limit the
+    # branching of each node to prevent memory explosions. 1e-6 here is
+    # effectively the minimum size that a node can take
+    if node.is_leaf and node.num_points == 0 or is_close(node, point, 1e-6):
         update_center_of_mass(node, point)
         return
 
@@ -124,10 +125,10 @@ cdef void split_node(Node * node):
     PyMem_Free(new_center)
 
 
-cdef inline bint is_duplicate(Node * node, double * point, double duplicate_eps=1e-16) nogil:
+cdef inline bint is_close(Node * node, double * point, double eps) nogil:
     cdef Py_ssize_t d
     for d in range(node.n_dims):
-        if fabs(node.center_of_mass[d] - point[d]) >= duplicate_eps:
+        if fabs(node.center_of_mass[d] - point[d]) >= eps:
             return False
     return True

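The is_close cutoff in add_point_to is the heart of the fix. A point-region tree separates points by splitting cells, but exact duplicates can never be separated by splitting, so insertion previously kept subdividing and allocating nodes without bound; with the cutoff, a point that lands within 1e-6 of a leaf's center of mass is simply folded into that leaf. A toy illustration of the failure mode (a hypothetical 1-D TinyTree in plain Python, not openTSNE's quad tree):

class TinyTree:
    # A deliberately naive 1-D point-region tree: a leaf that already
    # holds a point is split on every insert.
    def __init__(self, lo, hi):
        self.lo, self.hi = lo, hi
        self.point = None      # occupied-leaf payload
        self.children = None   # (left, right) once split

    def insert(self, x, eps=0.0, depth=0):
        if self.children is None:
            if self.point is None:
                self.point = x
                return depth
            if abs(self.point - x) < eps:  # the fix: merge near-duplicates
                return depth
            # Split the leaf, push the old point down, then fall through
            # to route the new point as well.
            mid = (self.lo + self.hi) / 2
            self.children = (TinyTree(self.lo, mid), TinyTree(mid, self.hi))
            old, self.point = self.point, None
            self.insert(old, eps, depth)
        mid = (self.lo + self.hi) / 2
        child = self.children[0] if x < mid else self.children[1]
        return child.insert(x, eps, depth + 1)

tree = TinyTree(0.0, 1.0)
tree.insert(0.5)
try:
    tree.insert(0.5)               # exact duplicate, no cutoff
except RecursionError:
    print("eps=0: splitting never terminates")

tree = TinyTree(0.0, 1.0)
tree.insert(0.5)
print("eps=1e-6: merged at depth", tree.insert(0.5, eps=1e-6))

As the new comment in add_point_to says, 1e-6 acts as an effective minimum node size: points closer than that are treated as one mass, trading a negligible amount of accuracy in the repulsive forces for bounded tree depth and memory.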
tests/test_correctness.py (+23)

@@ -333,3 +333,26 @@ def test_early_exaggeration_does_not_collapse(self):
         x = np.random.randn(n, d)
         embedding = openTSNE.TSNE(random_state=42).fit(x)
         self.assertGreater(np.max(np.abs(embedding)), 1e-8)
+
+
+class TestDataMatricesWithDuplicatedRows(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        from sklearn.preprocessing import KBinsDiscretizer
+
+        # Load up contrived example where we have a large number of duplicated
+        # rows. This is similar to the Titanic data set, which is problematic.
+        np.random.seed(0)
+        iris = datasets.load_iris()
+        x, y = iris.data, iris.target
+
+        discretizer = KBinsDiscretizer(n_bins=2, strategy="uniform")
+        x = discretizer.fit_transform(x).toarray()
+
+        idx = np.random.choice(x.shape[0], size=1000, replace=True)
+        cls.x, cls.y = x[idx], y[idx]
+
+    def test_works_without_error(self):
+        openTSNE.TSNE(
+            early_exaggeration=100, negative_gradient_method="bh", random_state=0
+        ).fit(self.x)
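The new test is a minimal reproduction: binarizing the four iris features and resampling 1000 rows leaves only a handful of distinct points. A quick standalone check of that claim (plain Python, assuming scikit-learn and numpy are installed; not part of the test suite):

import numpy as np
from sklearn import datasets
from sklearn.preprocessing import KBinsDiscretizer

x = datasets.load_iris().data
x = KBinsDiscretizer(n_bins=2, strategy="uniform").fit_transform(x).toarray()
idx = np.random.default_rng(0).choice(x.shape[0], size=1000, replace=True)
# Four features with two bins each allow at most 2**4 = 16 distinct rows,
# so nearly every sampled row is an exact duplicate of another.
print(len(np.unique(x[idx], axis=0)), "unique rows out of 1000")

Before this commit, feeding such a matrix to the Barnes-Hut backend would subdivide the quad tree indefinitely on those duplicates, which is the exploding memory the commit title refers to.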
