
Commit ee460d5

Merge pull request #3192 from pavlin-policar/tsne
[ENH] Replace scikit-learn tSNE with faster implementation
2 parents 62fbb26 + 46f8506 commit ee460d5

File tree

6 files changed: +373 -94 lines


Orange/projection/manifold.py

+190 -32
@@ -1,14 +1,17 @@
 import warnings
 
 import numpy as np
-
-from scipy.sparse.linalg import eigsh as arpack_eigh
+import sklearn.manifold as skl_manifold
+import scipy.sparse as sp
 from scipy.linalg import eigh as lapack_eigh
+from scipy.sparse.linalg import eigsh as arpack_eigh
 
-import sklearn.manifold as skl_manifold
+import fastTSNE
 
+import Orange
+from Orange.data import Table, Domain, ContinuousVariable
 from Orange.distance import Distance, DistanceModel, Euclidean
-from Orange.projection import SklProjector
+from Orange.projection import SklProjector, Projector, Projection
 
 __all__ = ["MDS", "Isomap", "LocallyLinearEmbedding", "SpectralEmbedding",
            "TSNE"]
@@ -110,9 +113,9 @@ def __init__(self, n_components=2, metric=True, n_init=4, max_iter=300,
     def __call__(self, data):
         params = self.params.copy()
         dissimilarity = params['dissimilarity']
-        if isinstance(self._metric, DistanceModel) \
-                or (isinstance(self._metric, type)
-                    and issubclass(self._metric, Distance)):
+        if isinstance(self._metric, DistanceModel) or (
+            isinstance(self._metric, type) and issubclass(self._metric, Distance)
+        ):
             data = self.preprocess(data)
             _X, Y, domain = data.X, data.Y, data.domain
             X = dist_matrix = self._metric(_X)
@@ -178,31 +181,186 @@ def __init__(self, n_components=2, affinity='nearest_neighbors', gamma=None,
         self.params = vars()
 
 
-class TSNE(SklProjector):
-    __wraps__ = skl_manifold.TSNE
-    name = 't-SNE'
+class TSNEModel(Projection):
+    """A tSNE embedding object. Supports further optimization as well as
+    adding new data into the existing embedding.
 
-    def __init__(self, n_components=2, perplexity=30.0, early_exaggeration=4.0,
-                 learning_rate=1000.0, n_iter=1000, n_iter_without_progress=30,
-                 min_grad_norm=1e-07, metric='euclidean', init='random',
-                 random_state=None, method='barnes_hut', angle=0.5, n_jobs=1,
-                 preprocessors=None):
+    Attributes
+    ----------
+    embedding_ : fastTSNE.TSNEEmbedding
+        The embedding object which takes care of subsequent optimizations and
+        transforms.
+    embedding : Table
+        The embedding in an Orange table, easily accessible.
+
+    """
+    def __init__(self, embedding: fastTSNE.TSNEEmbedding, table: Table):
+        self.embedding_ = embedding
+        self.embedding = table
+
+    def transform(self, X: np.ndarray, **kwargs) -> fastTSNE.PartialTSNEEmbedding:
+        if sp.issparse(X):
+            raise TypeError(
+                'A sparse matrix was passed, but dense data is required. Use '
+                'X.toarray() to convert to a dense numpy array.'
+            )
+        return self.embedding_.transform(X, **kwargs)
+
+    def __call__(self, data: Table, **kwargs) -> Table:
+        # If we want to transform new data, ensure that we use the correct domain
+        if data.domain != self.original_domain:
+            data = data.transform(self.original_domain)
+
+        embedding = self.transform(data.X, **kwargs)
+        return Table(self.embedding.domain, embedding.view(), data.Y, data.metas)
+
+    def optimize(self, n_iter, inplace=False, propagate_exception=False, **kwargs):
+        """Resume optimization for the current embedding."""
+        kwargs = {'n_iter': n_iter, 'inplace': inplace,
+                  'propagate_exception': propagate_exception, **kwargs}
+        if inplace:
+            self.embedding_.optimize(**kwargs)
+            return self
+
+        # If not inplace, we return a new TSNEModel object
+        new_embedding = self.embedding_.optimize(**kwargs)
+        table = Table(self.embedding.domain, new_embedding.view(np.ndarray),
+                      self.embedding.Y, self.embedding.metas)
+        return TSNEModel(new_embedding, table)
+
+
+class TSNE(Projector):
+    """t-distributed stochastic neighbor embedding (tSNE).
+
+    Parameters
+    ----------
+    n_components : int
+        The number of dimensions that the embedding should contain. Note that
+        only up to two dimensions are supported as otherwise the process can
+        become prohibitively expensive.
+    perplexity : float
+        The desired perplexity of the probability distribution.
+    learning_rate : float
+        The learning rate for t-SNE. Typical values range from 1 to 1000.
+        Setting the learning rate too high will result in the crowding problem
+        where all the points form a ball in the center of the space.
+    early_exaggeration_iter : int
+        The number of iterations that the early exaggeration phase will be run
+        for. Early exaggeration helps better separate clusters by increasing
+        attractive forces between similar points.
+    early_exaggeration : float
+        The exaggeration term is used to increase the attractive forces during
+        the first steps of the optimization. This enables points to move more
+        easily through others, helping find their true neighbors quicker.
+    n_iter : int
+        The number of iterations to run the optimization after the early
+        exaggeration phase.
+    theta : float
+        This is the trade-off parameter between speed and accuracy of the
+        Barnes-Hut approximation of the negative forces. Setting a lower value
+        will produce more accurate results, while setting a higher value will
+        search through less of the space, providing a rougher approximation.
+        Scikit-learn recommends values between 0.2-0.8. This value is ignored
+        unless the Barnes-Hut algorithm is used to compute negative gradients.
+    min_num_intervals : int
+        The minimum number of intervals into which we split our embedding. A
+        larger value will produce better embeddings at the cost of performance.
+        This value is ignored unless the interpolation based algorithm is used
+        to compute negative gradients.
+    ints_in_interval : float
+        Since the coordinate range of the embedding will certainly change
+        during optimization, this value tells us how many integer values should
+        appear in a single interval. This number of intervals affects the
+        embedding quality at the cost of performance. Fewer ints per interval
+        will incur a larger number of intervals. This value is ignored unless
+        the interpolation based algorithm is used to compute negative gradients.
+    initialization : Optional[Union[np.ndarray, str]]
+        An initial embedding strategy can be provided. A precomputed array with
+        coordinates can be passed in, or optionally "random" or "pca"
+        initializations are available. Note that while PCA can sometimes lead
+        to faster convergence times, it can sometimes also lead to poor
+        embeddings. Random initialization is typically a safe bet.
+    metric : str
+        The metric which will be used to evaluate the similarities between the
+        input data points in the high dimensional space.
+    n_jobs : int
+        Parts of the algorithm can be run in parallel and are thus faster.
+    neighbors : str
+        The method used to compute the nearest neighbors in the original, high
+        dimensional data set. Possible values are "exact" or "approx" or any
+        instance inheriting from `fastTSNE.nearest_neighbors.KNNIndex`. When
+        dealing with larger data sets, approximate NN search is faster; when
+        dealing with smaller data sets, exact NN search is typically faster.
+    negative_gradient_method : str
+        The method used to evaluate negative gradients (repulsive forces) in
+        the embedding. Possible values are "bh" for Barnes-Hut or "fft" for
+        Fast Fourier Accelerated Interpolation based tSNE, or FItSNE for short.
+        BH tends to be faster for smaller data sets but scales as O(n log n),
+        while FItSNE is faster for larger data sets and scales linearly in the
+        number of points.
+    callbacks : Callable[[int, float, np.ndarray] -> bool]
+        The callback should accept three parameters: the first is the current
+        iteration, the second is the current KL divergence error and the last
+        is the current embedding. The callback should return a boolean value
+        indicating whether or not to stop optimization, i.e. True to stop.
+        This is convenient because returning `None` is falsey and helps avoid
+        potential bugs if forgetting to return. Optionally, a list of callbacks
+        is also supported.
+    callbacks_every_iters : int
+        How often the callback should be called.
+    preprocessors
+
+    """
+    name = 't-SNE'
+    preprocessors = [
+        Orange.preprocess.Continuize(),
+        Orange.preprocess.SklImpute(),
+    ]
+
+    def __init__(self, n_components=2, perplexity=30, learning_rate=200,
+                 early_exaggeration_iter=250, early_exaggeration=12,
+                 n_iter=750, exaggeration=None, theta=0.5, min_num_intervals=10,
+                 ints_in_interval=1, initialization='random', metric='euclidean',
+                 n_jobs=1, neighbors='exact', negative_gradient_method='bh', callbacks=None,
+                 callbacks_every_iters=50, preprocessors=None):
         super().__init__(preprocessors=preprocessors)
-        self.params = vars()
+        self.tsne = fastTSNE.TSNE(
+            n_components=n_components, perplexity=perplexity,
+            learning_rate=learning_rate, early_exaggeration=early_exaggeration,
+            early_exaggeration_iter=early_exaggeration_iter, n_iter=n_iter,
+            exaggeration=exaggeration, theta=theta, min_num_intervals=min_num_intervals,
+            ints_in_interval=ints_in_interval, initialization=initialization,
+            metric=metric, n_jobs=n_jobs, neighbors=neighbors,
+            negative_gradient_method=negative_gradient_method,
+            callbacks=callbacks, callbacks_every_iters=callbacks_every_iters,
+        )
 
-    def __call__(self, data):
-        params = self.params.copy()
-        metric = params["metric"]
-        if metric == 'precomputed':
-            X, Y, domain = data, None, None
-        else:
-            data = self.preprocess(data)
-            X, Y, domain = data.X, data.Y, data.domain
-            if isinstance(metric, Distance):
-                X = metric(X)
-                params['metric'] = 'precomputed'
-
-        tsne = self.__wraps__(**params)
-        tsne.fit(X, y=Y)
-        tsne.domain = domain
-        return tsne
+    def fit(self, X: np.ndarray, Y: np.ndarray = None) -> fastTSNE.TSNEEmbedding:
+        if sp.issparse(X):
+            raise TypeError(
+                'A sparse matrix was passed, but dense data is required. Use '
+                'X.toarray() to convert to a dense numpy array.'
+            )
+        return self.tsne.fit(X)
+
+    def __call__(self, data: Table) -> TSNEModel:
+        # Preprocess the data - convert discrete to continuous
+        data = self.preprocess(data)
+
+        # Run tSNE optimization
+        embedding = self.fit(data.X, data.Y)
+
+        # The results should be accessible in an Orange table, which doesn't
+        # need the full embedding attributes and is cast into a regular array
+        tsne_cols = [ContinuousVariable('t-SNE-%d' % (i + 1))
+                     for i in range(self.tsne.n_components)]
+        embedding_domain = Domain(tsne_cols, data.domain.class_vars, data.domain.metas)
+        embedding_table = Table(embedding_domain, embedding.view(np.ndarray), data.Y, data.metas)
+
+        # Create a model object which will be capable of transforming new data
+        # into the existing embedding
+        model = TSNEModel(embedding, embedding_table)
+        model.original_domain = data.domain
+        model.name = self.name
+
+        return model
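
To make the new interface concrete, here is a minimal usage sketch of the classes introduced above (not part of the commit; the iris dataset and the parameter values are arbitrary examples, everything else follows the code in this diff):

    from Orange.data import Table
    from Orange.projection import TSNE

    data = Table('iris')

    # Fit an embedding; Continuize/SklImpute preprocessing happens inside __call__
    tsne = TSNE(n_components=2, perplexity=30, negative_gradient_method='bh')
    model = tsne(data)                    # returns a TSNEModel

    model.embedding                       # Orange Table with t-SNE-1, t-SNE-2 columns
    model.embedding_                      # the underlying fastTSNE.TSNEEmbedding

    # Place new data points into the existing embedding
    new_points = model(data[:10])         # Orange Table of transformed coordinates

    # Resume optimization; with inplace=False a new TSNEModel is returned
    refined = model.optimize(n_iter=250, inplace=False)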

Orange/tests/test_manifold.py

+108 -21
@@ -2,13 +2,19 @@
 # pylint: disable=missing-docstring
 
 import unittest
+
 import numpy as np
+from sklearn.metrics import accuracy_score
+from sklearn.neighbors import KNeighborsClassifier
 
+from Orange.data import Table
+from Orange.distance import Euclidean
 from Orange.projection import (MDS, Isomap, LocallyLinearEmbedding,
                                SpectralEmbedding, TSNE)
 from Orange.projection.manifold import torgerson
-from Orange.distance import Euclidean
-from Orange.data import Table
+
+
+np.random.seed(42)
 
 
 class TestManifold(unittest.TestCase):
@@ -117,25 +123,6 @@ def __se_test_helper(self, data, n_com):
         se = se(data)
         self.assertEqual((data.X.shape[0], n_com), se.embedding_.shape)
 
-    def test_tsne(self):
-        data = self.ionosphere[:50]
-        for i in range(1, 4):
-            self.__tsne_test_helper(data, n_com=i)
-
-    def __tsne_test_helper(self, data, n_com):
-        tsne_def = TSNE(n_components=n_com, metric='euclidean')
-        tsne_def = tsne_def(data)
-
-        tsne_euc = TSNE(n_components=n_com, metric=Euclidean)
-        tsne_euc = tsne_euc(data)
-
-        tsne_pre = TSNE(n_components=n_com, metric='precomputed')
-        tsne_pre = tsne_pre(Euclidean(data))
-
-        self.assertEqual((data.X.shape[0], n_com), tsne_def.embedding_.shape)
-        self.assertEqual((data.X.shape[0], n_com), tsne_euc.embedding_.shape)
-        self.assertEqual((data.X.shape[0], n_com), tsne_pre.embedding_.shape)
-
     def test_torgerson(self):
         data = self.ionosphere[::5]
         dis = Euclidean(data)
@@ -149,3 +136,103 @@ def test_torgerson(self):
 
         with self.assertRaises(ValueError):
             torgerson(dis, eigen_solver="madness")
+
+
+class TestTSNE(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.iris = Table('iris')
+
+    def test_fit(self):
+        n_components = 2
+        tsne = TSNE(n_components=n_components)
+        model = tsne(self.iris)
+
+        # The embedding should have the correct number of dimensions
+        self.assertEqual(model.embedding.X.shape, (self.iris.X.shape[0], n_components))
+
+        # The embedding should not contain NaNs
+        self.assertFalse(np.any(np.isnan(model.embedding.X)))
+
+        # The embeddings in the table should match the embedding object
+        np.testing.assert_equal(model.embedding.X, model.embedding_)
+
+    def test_transform(self):
+        # Set perplexity to avoid warnings
+        tsne = TSNE(perplexity=10)
+        model = tsne(self.iris[::2])
+        new_embedding = model(self.iris[1::2])
+
+        # The new embedding should not contain NaNs
+        self.assertFalse(np.any(np.isnan(new_embedding.X)))
+
+    def test_continue_optimization(self):
+        tsne = TSNE(n_iter=100)
+        model = tsne(self.iris)
+        new_model = model.optimize(100, inplace=False)
+
+        # If we don't do things inplace, then the instances should be different
+        self.assertIsNot(model, new_model)
+        self.assertIsNot(model.embedding, new_model.embedding)
+        self.assertIsNot(model.embedding_, new_model.embedding_)
+
+        self.assertFalse(np.allclose(model.embedding.X, new_model.embedding.X),
+                         'Embedding should change after further optimization.')
+
+        # The embeddings in the table should match the embedding object
+        np.testing.assert_equal(new_model.embedding.X, new_model.embedding_)
+
+    def test_continue_optimization_inplace(self):
+        tsne = TSNE(n_iter=100)
+        model = tsne(self.iris)
+        new_model = model.optimize(100, inplace=True)
+
+        # If we do things inplace, then the instances should be the same
+        self.assertIs(model, new_model)
+        self.assertIs(model.embedding, new_model.embedding)
+        self.assertIs(model.embedding_, new_model.embedding_)
+
+        # The embeddings in the table should match the embedding object
+        np.testing.assert_equal(new_model.embedding.X, new_model.embedding_)
+
+    def test_bh_correctness(self):
+        knn = KNeighborsClassifier(n_neighbors=5)
+
+        # Set iterations to 0 so we check that the initialization is fairly random
+        tsne = TSNE(early_exaggeration_iter=0, n_iter=0, perplexity=30,
+                    negative_gradient_method='bh', initialization='random')
+        model = tsne(self.iris)
+
+        # Evaluate KNN on the random initialization
+        knn.fit(model.embedding_, self.iris.Y)
+        predicted = knn.predict(model.embedding_)
+        self.assertTrue(accuracy_score(predicted, self.iris.Y) < 0.6)
+
+        # 100 iterations should be enough for iris
+        model.optimize(n_iter=100, inplace=True)
+
+        # Evaluate KNN on the tSNE embedding
+        knn.fit(model.embedding_, self.iris.Y)
+        predicted = knn.predict(model.embedding_)
+        self.assertTrue(accuracy_score(predicted, self.iris.Y) > 0.95)
+
+    def test_fft_correctness(self):
+        knn = KNeighborsClassifier(n_neighbors=5)
+
+        # Set iterations to 0 so we check that the initialization is fairly random
+        tsne = TSNE(early_exaggeration_iter=0, n_iter=0, perplexity=30,
+                    negative_gradient_method='fft', initialization='random')
+        model = tsne(self.iris)
+
+        # Evaluate KNN on the random initialization
+        knn.fit(model.embedding_, self.iris.Y)
+        predicted = knn.predict(model.embedding_)
+        self.assertTrue(accuracy_score(predicted, self.iris.Y) < 0.6)
+
+        # 100 iterations should be enough for iris
+        model.optimize(n_iter=100, inplace=True)
+
+        # Evaluate KNN on the tSNE embedding
+        knn.fit(model.embedding_, self.iris.Y)
+        predicted = knn.predict(model.embedding_)
+        self.assertTrue(accuracy_score(predicted, self.iris.Y) > 0.95)
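
The callbacks mechanism documented in the new TSNE class (called every callbacks_every_iters iterations with the current iteration, the current KL divergence and the current embedding, and returning True to stop) can be sketched like this; the threshold and function name below are made-up illustrations, not part of the commit:

    from Orange.data import Table
    from Orange.projection import TSNE

    def report_progress(iteration, kl_divergence, embedding):
        print('iteration %d: KL divergence %.4f' % (iteration, kl_divergence))
        # Returning True stops the optimization early; 1.0 is an arbitrary cut-off
        return kl_divergence < 1.0

    tsne = TSNE(callbacks=[report_progress], callbacks_every_iters=25)
    model = tsne(Table('iris'))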

0 commit comments
