Skip to content

Commit 25d48d8

Browse files
authored
Merge pull request #11 from beringresearch/remove-supervised
Remove supervised
2 parents 620cbbc + e8df57a commit 25d48d8

File tree

7 files changed

+1097
-1085
lines changed

7 files changed

+1097
-1085
lines changed

R-package/R/ivis.R

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
#' IVIS algorithm
22
#'
33
#' @param X numerical matrix to be reduced. Columns correspond to features.
4-
#' @param y int, optional (default: NULL). Optional class vector triggering supervised triplet selection.
54
#' @param embedding_dims int, optional (default: 2) Number of dimensions in the embedding space
65
#' @param k int, optional (default: 150)
76
#' The number of neighbours to retrieve for each point
@@ -24,7 +23,7 @@
2423
#' Whether to pre-compute the nearest neighbours. Pre-computing is significantly faster, but requires more memory. If memory is limited, try setting this to False.
2524
#' @export
2625

27-
ivis <- function(X, y = NULL, embedding_dims = 2L,
26+
ivis <- function(X, embedding_dims = 2L,
2827
k = 150L,
2928
distance = "pn",
3029
batch_size = 128L,
@@ -52,7 +51,7 @@ ivis <- function(X, y = NULL, embedding_dims = 2L,
5251
epochs = epochs, n_epochs_without_progress = n_epochs_without_progress,
5352
margin = margin, ntrees = ntrees, search_k = search_k, precompute = precompute)
5453

55-
embeddings = model$fit_transform(X = X, y = y)
54+
embeddings = model$fit_transform(X = X)
5655
return(embeddings)
5756

5857
}

README.md

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,6 @@ After cloning this repo run: `pip install -r requirements.txt --editable .` from
1010

1111
## Examples
1212

13-
Ivis can be run in both unsupervised and supervised mode. To run in supervised mode, simply provide an array of labels to the .fit() method.
14-
1513
### Unsupervised embeddings
1614

1715
```
@@ -20,7 +18,6 @@ from sklearn import datasets
2018
2119
iris = datasets.load_iris()
2220
X = iris.data
23-
y = iris.target
2421
2522
model = Ivis(embedding_dims=2, k=15)
2623
@@ -31,23 +28,6 @@ Plotting the embeddings results in the following visualization:
3128

3229
![](docs/ivis-iris-demo.png)
3330

34-
### Supervised embeddings
35-
36-
```
37-
from keras.datasets import mnist
38-
import numpy as np
39-
from ivis import Ivis
40-
41-
(x_train, y_train), (x_test, y_test) = mnist.load_data()
42-
x_test = np.reshape(x_test.astype('float32'), (len(x_test), 28 * 28))
43-
44-
45-
model = Ivis()
46-
embeddings = model.fit_transform(x_test, y_test)
47-
```
48-
49-
![](docs/ivis_mnist_supervised_embeddings.png)
50-
5131
### Training on a .h5 dataset
5232

5333
Load the data using a HDF5Matrix object provided by keras.

examples/.ipynb_checkpoints/iris_dimensionality_reduction-checkpoint.ipynb

Lines changed: 533 additions & 486 deletions
Large diffs are not rendered by default.

examples/iris_dimensionality_reduction.ipynb

Lines changed: 533 additions & 486 deletions
Large diffs are not rendered by default.

ivis/data/triplet_generators.py

Lines changed: 0 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -61,10 +61,6 @@ def create_triplet_generator_from_annoy_index(X, index, k, batch_size, search_k=
6161
return generate_knn_triplets_from_annoy_index(X, index, k=k, batch_size=batch_size, search_k=search_k)
6262

6363

64-
def create_triplet_generator_from_labels(X, y, batch_size):
65-
return generate_triplets_from_labels(X, np.array(y), batch_size=batch_size)
66-
67-
6864
def knn_triplet_from_neighbour_list(X, index, neighbour_list):
6965
""" A random (unweighted) positive example chosen. """
7066
N_ROWS = X.shape[0]
@@ -154,53 +150,6 @@ def generate_knn_triplets_from_annoy_index(X, annoy_index, k=150, batch_size=32,
154150
triplet_batch = np.array(triplet_batch)
155151
yield ([triplet_batch[:,0], triplet_batch[:,1], triplet_batch[:,2]], placeholder_labels)
156152

157-
@threadsafe_generator
158-
def generate_triplets_from_labels(X, Y, batch_size=32):
159-
N_ROWS = X.shape[0]
160-
iterations = 0
161-
row_indexes = np.array(list(range(N_ROWS)), dtype=np.uint32)
162-
np.random.shuffle(row_indexes)
163-
164-
placeholder_labels = np.array([0 for i in range(batch_size)])
165-
166-
while True:
167-
triplet_batch = []
168-
169-
for i in range(batch_size):
170-
if iterations >= N_ROWS:
171-
np.random.shuffle(row_indexes)
172-
iterations = 0
173-
174-
triplet = triplet_from_labels(X, Y, row_indexes[iterations])
175-
176-
triplet_batch += triplet
177-
iterations += 1
178-
179-
if (issparse(X)):
180-
triplet_batch = [[e.toarray()[0] for e in t] for t in triplet_batch]
181-
182-
triplet_batch = np.array(triplet_batch)
183-
yield ([triplet_batch[:,0], triplet_batch[:,1], triplet_batch[:,2]], placeholder_labels)
184-
185-
def triplet_from_labels(X, Y, index):
186-
""" A random (unweighted) positive example chosen. """
187-
N_ROWS = X.shape[0]
188-
triplets = []
189-
190-
row_label = Y[index]
191-
neighbour_indexes = np.where(Y == row_label)[0]
192-
193-
# Take a random neighbour as positive
194-
neighbour_ind = np.random.choice(neighbour_indexes)
195-
196-
# Take a random non-neighbour as negative
197-
negative_ind = np.random.randint(0, N_ROWS) # Pick a random index until one fits constraint. An optimization.
198-
while negative_ind in neighbour_indexes:
199-
negative_ind = np.random.randint(0, N_ROWS)
200-
201-
triplets += [[X[index], X[neighbour_ind], X[negative_ind]]]
202-
return triplets
203-
204153
def create_triplets_from_positive_index_dict(X, positive_index_dict):
205154
N_ROWS = X.shape[0]
206155
triplets = []

ivis/ivis.py

Lines changed: 24 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
""" scikit-learn wrapper class for the Ivis algorithm. """
22

3-
from .data.triplet_generators import create_triplet_generator_from_annoy_index, create_triplet_generator_from_labels
3+
from .data.triplet_generators import create_triplet_generator_from_annoy_index
44
from .nn.network import build_network, selu_base_network
55
from .nn.losses import triplet_loss
66
from .data.knn import build_annoy_index
@@ -80,26 +80,18 @@ def __init__(self, embedding_dims=2, k=150, distance='pn', batch_size=128, epoch
8080
self.model_ = model
8181
self.annoy_index = annoy_index
8282

83-
def _fit(self, X, y, val_x, val_y, shuffle_mode=True):
84-
if y is None:
85-
self.annoy_index = self.annoy_index or build_annoy_index(X, ntrees=self.ntrees)
86-
datagen = create_triplet_generator_from_annoy_index(X, index=self.annoy_index, k=self.k, batch_size=self.batch_size, search_k=self.search_k, precompute=self.precompute)
87-
else:
88-
datagen = create_triplet_generator_from_labels(X, y, batch_size=self.batch_size)
83+
def _fit(self, X, shuffle_mode=True):
84+
85+
self.annoy_index = self.annoy_index or build_annoy_index(X, ntrees=self.ntrees)
86+
datagen = create_triplet_generator_from_annoy_index(X,
87+
index=self.annoy_index,
88+
k=self.k,
89+
batch_size=self.batch_size,
90+
search_k=self.search_k,
91+
precompute=self.precompute)
8992

90-
val_datagen = None
91-
validation_steps = None
9293
loss_monitor = 'loss'
93-
94-
if val_x is not None:
95-
if val_y is None:
96-
val_index = build_annoy_index(val_x, ntrees=self.ntrees)
97-
val_datagen = create_triplet_generator_from_annoy_index(val_x, index=val_index, k=self.k, batch_size=self.batch_size, search_k=self.search_k, precompute=self.precompute)
98-
else:
99-
val_datagen = create_triplet_generator_from_labels(X, y, batch_size=self.batch_size)
100-
101-
validation_steps = int(val_x.shape[0] / self.batch_size)
102-
loss_monitor = 'val_loss'
94+
10395
if self.model_:
10496
model = build_network(self.model_, embedding_dims=self.embedding_dims)
10597
else:
@@ -115,42 +107,40 @@ def _fit(self, X, y, val_x, val_y, shuffle_mode=True):
115107
hist = model.fit_generator(datagen,
116108
steps_per_epoch=int(X.shape[0] / self.batch_size),
117109
epochs=self.epochs,
118-
callbacks=[EarlyStopping(monitor=loss_monitor, patience=self.n_epochs_without_progress)],
119-
validation_data=val_datagen,
120-
validation_steps=validation_steps,
110+
callbacks=[EarlyStopping(monitor=loss_monitor, patience=self.n_epochs_without_progress)],
121111
shuffle=shuffle_mode,
122112
workers=multiprocessing.cpu_count() )
123113
self.loss_history_ = hist.history['loss']
124114
self.model_ = model.layers[3]
125115

126-
def fit(self, X, y=None, val_x=None, val_y=None, shuffle_mode=True):
127-
self._fit(X, y, val_x, val_y, shuffle_mode)
116+
def fit(self, X, shuffle_mode=True):
117+
self._fit(X, shuffle_mode)
128118
return self
129119

130-
def fit_transform(self, X, y=None, val_x=None, val_y=None, shuffle_mode=True):
131-
self.fit(X, y, val_x, val_y, shuffle_mode)
120+
def fit_transform(self, X, shuffle_mode=True):
121+
self.fit(X, shuffle_mode)
132122
return self.transform(X)
133123

134124
def transform(self, X):
135125
embedding = self.model_.predict(X)
136126
return embedding
137127

138-
def save(self, filepath):
128+
def save_model(self, filepath):
139129
self.model_.save(filepath)
140130

141-
def load(self, filepath):
131+
def load_model(self, filepath):
142132
model = load_model(filepath)
143133
self.model_ = model
144134
self.model_._make_predict_function()
145135
return self
146-
147-
def load_index(self, filepath):
148-
annoy_index = AnnoyIndex()
149-
annoy_index.load(filepath)
150-
self.annoy_index = annoy_index
151136

152137
def save_index(self, filepath):
153138
if self.annoy_index is not None:
154139
self.annoy_index.save(filepath)
155140
else:
156-
raise Exception('No annoy index to save.')
141+
raise Exception('No annoy index to save.')
142+
143+
def load_index(self, filepath):
144+
annoy_index = AnnoyIndex()
145+
annoy_index.load(filepath)
146+
self.annoy_index = annoy_index

requirements.txt

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
tensorflow==1.10.0
2-
keras==2.2.2
3-
numpy==1.14.3
4-
scikit-learn==0.20.0
5-
tqdm==4.19.4
1+
tensorflow
2+
keras
3+
numpy
4+
scikit-learn>0.20.0
5+
tqdm
66
git+https://github.com/beringresearch/annoy.git#egg=annoy

0 commit comments

Comments
 (0)