 import warnings
 
 import numpy as np
-
-from scipy.sparse.linalg import eigsh as arpack_eigh
+import sklearn.manifold as skl_manifold
+import scipy.sparse as sp
 from scipy.linalg import eigh as lapack_eigh
+from scipy.sparse.linalg import eigsh as arpack_eigh
 
-import sklearn.manifold as skl_manifold
+import fastTSNE
 
+import Orange
+from Orange.data import Table, Domain, ContinuousVariable
 from Orange.distance import Distance, DistanceModel, Euclidean
-from Orange.projection import SklProjector
+from Orange.projection import SklProjector, Projector, Projection
 
 __all__ = ["MDS", "Isomap", "LocallyLinearEmbedding", "SpectralEmbedding",
            "TSNE"]
@@ -110,9 +113,9 @@ def __init__(self, n_components=2, metric=True, n_init=4, max_iter=300,
     def __call__(self, data):
         params = self.params.copy()
         dissimilarity = params['dissimilarity']
-        if isinstance(self._metric, DistanceModel) \
-                or (isinstance(self._metric, type)
-                    and issubclass(self._metric, Distance)):
+        if isinstance(self._metric, DistanceModel) or (
+            isinstance(self._metric, type) and issubclass(self._metric, Distance)
+        ):
             data = self.preprocess(data)
             _X, Y, domain = data.X, data.Y, data.domain
             X = dist_matrix = self._metric(_X)
@@ -178,31 +181,186 @@ def __init__(self, n_components=2, affinity='nearest_neighbors', gamma=None,
         self.params = vars()
 
 
-class TSNE(SklProjector):
-    __wraps__ = skl_manifold.TSNE
-    name = 't-SNE'
+class TSNEModel(Projection):
+    """A tSNE embedding object. Supports further optimization as well as
+    adding new data into the existing embedding.
 
-    def __init__(self, n_components=2, perplexity=30.0, early_exaggeration=4.0,
-                 learning_rate=1000.0, n_iter=1000, n_iter_without_progress=30,
-                 min_grad_norm=1e-07, metric='euclidean', init='random',
-                 random_state=None, method='barnes_hut', angle=0.5, n_jobs=1,
-                 preprocessors=None):
+    Attributes
+    ----------
+    embedding_ : fastTSNE.TSNEEmbedding
+        The embedding object which takes care of subsequent optimizations or
+        transforms.
+    embedding : Table
+        The embedding in an Orange table, easily accessible.
+
+    """
+    def __init__(self, embedding: fastTSNE.TSNEEmbedding, table: Table):
+        self.embedding_ = embedding
+        self.embedding = table
+
+    def transform(self, X: np.ndarray, **kwargs) -> fastTSNE.PartialTSNEEmbedding:
+        if sp.issparse(X):
+            raise TypeError(
+                'A sparse matrix was passed, but dense data is required. Use '
+                'X.toarray() to convert to a dense numpy array.'
+            )
+        return self.embedding_.transform(X, **kwargs)
+
+    def __call__(self, data: Table, **kwargs) -> Table:
+        # If we want to transform new data, ensure that we use correct domain
+        if data.domain != self.original_domain:
+            data = data.transform(self.original_domain)
+
+        embedding = self.transform(data.X, **kwargs)
+        return Table(self.embedding.domain, embedding.view(), data.Y, data.metas)
+
+    def optimize(self, n_iter, inplace=False, propagate_exception=False, **kwargs):
+        """Resume optimization for the current embedding."""
+        kwargs = {'n_iter': n_iter, 'inplace': inplace,
+                  'propagate_exception': propagate_exception, **kwargs}
+        if inplace:
+            self.embedding_.optimize(**kwargs)
+            return self
+
+        # If not inplace, we return a new TSNEModel object
+        new_embedding = self.embedding_.optimize(**kwargs)
+        table = Table(self.embedding.domain, new_embedding.view(np.ndarray),
+                      self.embedding.Y, self.embedding.metas)
+        return TSNEModel(new_embedding, table)
+
+
+class TSNE(Projector):
+    """t-distributed stochastic neighbor embedding (tSNE).
+
+    Parameters
+    ----------
+    n_components : int
+        The number of dimensions that the embedding should contain. Note that
+        only up to two dimensions are supported, as otherwise the process can
+        become prohibitively expensive.
+    perplexity : float
+        The desired perplexity of the probability distribution.
+    learning_rate : float
+        The learning rate for t-SNE. Typical values range from 1 to 1000.
+        Setting the learning rate too high will result in the crowding problem
+        where all the points form a ball in the center of the space.
+    early_exaggeration_iter : int
+        The number of iterations that the early exaggeration phase will be run
+        for. Early exaggeration helps better separate clusters by increasing
+        attractive forces between similar points.
+    early_exaggeration : float
+        The exaggeration term is used to increase the attractive forces during
+        the first steps of the optimization. This enables points to move more
+        easily through others, helping find their true neighbors quicker.
+    n_iter : int
+        The number of iterations to run the optimization after the early
+        exaggeration phase.
+    theta : float
+        This is the trade-off parameter between speed and accuracy of the
+        Barnes-Hut approximation of the negative forces. Setting a lower value
+        will produce more accurate results, while setting a higher value will
+        search through less of the space, providing a rougher approximation.
+        Scikit-learn recommends values between 0.2 and 0.8. This value is
+        ignored unless the Barnes-Hut algorithm is used to compute negative
+        gradients.
+    min_num_intervals : int
+        The minimum number of intervals into which we split our embedding. A
+        larger value will produce better embeddings at the cost of performance.
+        This value is ignored unless the interpolation based algorithm is used
+        to compute negative gradients.
+    ints_in_interval : float
+        Since the coordinate range of the embedding will certainly change
+        during optimization, this value tells us how many integer values should
+        appear in a single interval. The number of intervals affects the
+        embedding quality at the cost of performance: fewer ints per interval
+        will result in a larger number of intervals. This value is ignored
+        unless the interpolation based algorithm is used to compute negative
+        gradients.
+    initialization : Optional[Union[np.ndarray, str]]
+        An initial embedding strategy can be provided. A precomputed array with
+        coordinates can be passed in, or optionally "random" or "pca"
+        initializations are available. Note that while PCA can sometimes lead
+        to faster convergence times, it can sometimes also lead to poor
+        embeddings. Random initialization is typically a safe bet.
+    metric : str
+        The metric which will be used to evaluate the similarities between the
+        input data points in the high dimensional space.
+    n_jobs : int
+        The number of jobs to run in parallel. Parts of the algorithm can run
+        in parallel and thus finish faster.
+    neighbors : str
+        The method used to compute the nearest neighbors in the original, high
+        dimensional data set. Possible values are "exact", "approx" or any
+        instance inheriting from `fastTSNE.nearest_neighbors.KNNIndex`. When
+        dealing with larger data sets, approximate NN search is faster; when
+        dealing with smaller data sets, exact NN search is typically faster.
+    negative_gradient_method : str
+        The method used to evaluate negative gradients (repulsive forces) in
+        the embedding. Possible values are "bh" for Barnes-Hut or "fft" for
+        Fast Fourier accelerated interpolation-based tSNE (FItSNE for short).
+        BH tends to be faster for smaller data sets but scales as O(n log n),
+        while FItSNE is faster for larger data sets and scales linearly in the
+        number of points.
+    callbacks : Callable[[int, float, np.ndarray] -> bool]
+        The callback should accept three parameters: the first is the current
+        iteration, the second is the current KL divergence error and the last
+        is the current embedding. The callback should return a boolean value
+        indicating whether or not to stop the optimization, i.e. True to stop.
+        This is convenient because returning `None` is falsy and helps avoid
+        potential bugs when forgetting to return a value. Optionally, a list
+        of callbacks is also supported.
+    callbacks_every_iters : int
+        How often the callback should be called.
+    preprocessors
+
+    """
+    name = 't-SNE'
+    preprocessors = [
+        Orange.preprocess.Continuize(),
+        Orange.preprocess.SklImpute(),
+    ]
+
+    def __init__(self, n_components=2, perplexity=30, learning_rate=200,
+                 early_exaggeration_iter=250, early_exaggeration=12,
+                 n_iter=750, exaggeration=None, theta=0.5, min_num_intervals=10,
+                 ints_in_interval=1, initialization='random', metric='euclidean',
+                 n_jobs=1, neighbors='exact', negative_gradient_method='bh', callbacks=None,
+                 callbacks_every_iters=50, preprocessors=None):
         super().__init__(preprocessors=preprocessors)
-        self.params = vars()
+        self.tsne = fastTSNE.TSNE(
+            n_components=n_components, perplexity=perplexity,
+            learning_rate=learning_rate, early_exaggeration=early_exaggeration,
+            early_exaggeration_iter=early_exaggeration_iter, n_iter=n_iter,
+            exaggeration=exaggeration, theta=theta, min_num_intervals=min_num_intervals,
+            ints_in_interval=ints_in_interval, initialization=initialization,
+            metric=metric, n_jobs=n_jobs, neighbors=neighbors,
+            negative_gradient_method=negative_gradient_method,
+            callbacks=callbacks, callbacks_every_iters=callbacks_every_iters,
+        )
 
-    def __call__(self, data):
-        params = self.params.copy()
-        metric = params["metric"]
-        if metric == 'precomputed':
-            X, Y, domain = data, None, None
-        else:
-            data = self.preprocess(data)
-            X, Y, domain = data.X, data.Y, data.domain
-        if isinstance(metric, Distance):
-            X = metric(X)
-            params['metric'] = 'precomputed'
-
-        tsne = self.__wraps__(**params)
-        tsne.fit(X, y=Y)
-        tsne.domain = domain
-        return tsne
+    def fit(self, X: np.ndarray, Y: np.ndarray = None) -> fastTSNE.TSNEEmbedding:
+        if sp.issparse(X):
+            raise TypeError(
+                'A sparse matrix was passed, but dense data is required. Use '
+                'X.toarray() to convert to a dense numpy array.'
+            )
+        return self.tsne.fit(X)
+
+    def __call__(self, data: Table) -> TSNEModel:
+        # Preprocess the data - convert discrete to continuous
+        data = self.preprocess(data)
+
+        # Run tSNE optimization
+        embedding = self.fit(data.X, data.Y)
+
+        # The results should be accessible in an Orange table, which doesn't
+        # need the full embedding attributes and is cast into a regular array
+        tsne_cols = [ContinuousVariable('t-SNE-%d' % (i + 1))
+                     for i in range(self.tsne.n_components)]
+        embedding_domain = Domain(tsne_cols, data.domain.class_vars, data.domain.metas)
+        embedding_table = Table(embedding_domain, embedding.view(np.ndarray), data.Y, data.metas)
+
+        # Create a model object which will be capable of transforming new data
+        # into the existing embedding
+        model = TSNEModel(embedding, embedding_table)
+        model.original_domain = data.domain
+        model.name = self.name
+
+        return model
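
The diff above replaces the scikit-learn wrapper with fastTSNE and introduces a two-stage API: a TSNE projector that fits an embedding, and a TSNEModel that can place new data into it. A minimal usage sketch follows; it is not part of the change, assumes this branch of Orange and the fastTSNE package are installed, and uses the bundled "iris" dataset purely for illustration.

from Orange.data import Table
from Orange.projection.manifold import TSNE

# Fit a 2-dimensional embedding with the new fastTSNE-backed projector
data = Table("iris")
tsne = TSNE(n_components=2, perplexity=30, negative_gradient_method="bh")
model = tsne(data)              # returns a TSNEModel

# The result is available both as an ordinary Orange table ...
print(model.embedding.domain)   # t-SNE-1, t-SNE-2 plus the class variable
# ... and as the underlying fastTSNE embedding object
print(type(model.embedding_))   # fastTSNE.TSNEEmbedding

# New data points can be projected into the existing embedding; the first ten
# rows merely stand in for genuinely unseen data here
projected = model(data[:10])
print(projected.X.shape)        # (10, 2)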
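
A second sketch, under the same assumptions, exercises the optimization hooks described in the docstrings above: a callback that monitors the KL divergence during fitting, and TSNEModel.optimize() resuming gradient descent on a finished embedding.

from Orange.data import Table
from Orange.projection.manifold import TSNE

def log_error(iteration, kl_divergence, embedding):
    # Called every `callbacks_every_iters` iterations with the current state
    print("iteration %d: KL divergence %.4f" % (iteration, kl_divergence))
    return False  # returning True would stop the optimization early

data = Table("iris")
tsne = TSNE(n_iter=500, callbacks=log_error, callbacks_every_iters=100)
model = tsne(data)

# Run 250 additional iterations; with inplace=False a new TSNEModel is
# returned and the original embedding is left untouched
refined = model.optimize(n_iter=250, inplace=False)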