
Commit 8478283 (parent: e60a504)

5 files changed: +118 -29 lines

README.md (+29 -2)

````diff
@@ -1,4 +1,31 @@
-# Keras Metric Learning
+# Keras Metric Learning Library
 Deep Metric Learning Library for Keras
 
-## Under Construction...
+## Welcome
+The Keras Metric Learning Library lets Keras users train models with the
+metric-learning losses published in the research literature. See this post
+for more info.
+
+## Getting Started
+Go ahead and clone the repo:
+```
+git clone http://github.com/jricheimer/keras-metric-learning
+```
+If you'd like to experiment with the Stanford Online Products dataset (a
+nicely sized dataset made for testing metric-learning approaches), take a
+minute to download it here.
+
+Then use our script to generate the HDF5 files for the dataset:
+```
+cd keras-metric-learning
+mkdir dataset && cd dataset
+python kml_create_stanford_hdf5.py --root_path /path/to/stanford/dataset
+```
+
+Once that finishes processing, you should have two HDF5 files (one for
+train, one for test) in the dataset directory.
+
+## Start training
+
+Take a look at the example notebooks to see how to use the library.
+Enjoy!
````
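For orientation, here is a minimal sketch of loading the generated train file and drawing batches with the data utilities touched by this commit. The file name `stanford_train.h5` and the one-HDF5-dataset-per-class layout are assumptions about the script's output, not guaranteed by this commit.

```python
# Hypothetical quick check of the generated data; the file name and the
# per-class dataset layout are assumed, not confirmed by this commit.
import h5py
from kml_data_utils import random_sample_generator

train_data = h5py.File('dataset/stanford_train.h5', 'r')
gen = random_sample_generator(train_data, batch_size=32, classes_per_batch=8)
images, labels = next(gen)  # one batch of images with integer class labels
```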

kml_callbacks.py (+19 -19)

```diff
@@ -5,7 +5,7 @@
 from keras.callbacks import Callback
 import numpy as np
 from types import GeneratorType
-from utils import recall_at_k, nmi
+from kml_utils import recall_at_k, nmi
 
 class RecallAtK(Callback):
     """Callback that computes the Recall@k metric for a given validation set at the end of each epoch.
@@ -25,11 +25,12 @@ def __init__(self, validation_data, validation_steps=1, k=1, metric='euclidean',
         self.model_name = model_name
         self.k = k
         self.metric = metric
-        self.validation_data = validation_data
-        self.validation_steps = validation_steps
+        # self.validation_data = validation_data
+        # self.validation_steps = validation_steps
         self.verbose = verbose
 
     def on_epoch_end(self, epoch, logs=None):
+
         logs = logs or {}
         if 'recall_at_{}'.format(self.k) not in logs:
             logs['recall_at_{}'.format(self.k)] = []
@@ -38,24 +39,23 @@ def on_epoch_end(self, epoch, logs=None):
             self.model = self.model.get_layer(self.model_name)
         else:
             sub_models = [l for l in self.model.layers if isinstance(l, Model)]
-            if len(sub_models) != 1:
-                raise ValueError('Training network must contain exactly one sub-model')
-            self.model = sub_models[0]
-        if isinstance(self.validation_data, GeneratorType):
-            val_embeddings = []
-            labels = []
-            for i in range(self.validation_steps):
-                data, targets = self.validation_data.next()
-                val_embeddings.append(self.model.predict(data))
-                labels.extend(targets)
-            val_embeddings = np.concatenate(val_embeddings, axis=0)
+            if len(sub_models) == 1:
+                self.model = sub_models[0]
+        # if isinstance(self.validation_data, GeneratorType):
+        #     val_embeddings = []
+        #     labels = []
+        #     for i in range(self.validation_steps):
+        #         data, targets = self.validation_data.next()
+        #         val_embeddings.append(self.model.predict(data))
+        #         labels.extend(targets)
+        #     val_embeddings = np.concatenate(val_embeddings, axis=0)
 
-        elif isinstance(self.validation_data, tuple) and len(self.validation_data) == 2:
-            val_embeddings = self.model.predict(self.validation_data[0])
-            labels = self.validation_data[1]
+        # elif isinstance(self.validation_data, tuple) and len(self.validation_data) == 2:
+        val_embeddings = self.model.predict(self.validation_data[0])
+        labels = self.validation_data[1]
 
-        else:
-            raise ValueError('validation_data must be either a generator object or a tuple (X,Y)')
+        # else:
+        #     raise ValueError('validation_data must be either a generator object or a tuple (X,Y)')
 
         recall = recall_at_k(val_embeddings, labels, k=self.k, metric=self.metric)
         logs['recall_at_{}'.format(self.k)].append(recall)
```
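After this change the callback no longer keeps its own copy of the validation data; it reads `self.validation_data`, which Keras populates on callbacks when `validation_data` is passed to `fit`. A sketch of the wiring under that assumption, where `siamese_model`, the training arrays, and the sub-model name `'embedding_model'` are placeholders, not part of this commit:

```python
# Sketch only: `siamese_model`, the arrays, and the sub-model name
# 'embedding_model' are placeholders, not defined in this commit.
from kml_callbacks import RecallAtK

recall_cb = RecallAtK(validation_data=None, k=1, metric='euclidean',
                      model_name='embedding_model')
siamese_model.fit([x1_train, x2_train], y_train,
                  validation_data=(x_val, y_val),  # Keras hands this to the callback
                  callbacks=[recall_cb])
```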

kml_data_utils.py (+53 -3)

```diff
@@ -60,7 +60,8 @@ def triplet_generator(data, batch_size=32,
     else:
         yield x_list
 
-def pair_generator(data, batch_size):
+def pair_generator(data, batch_size, all_similar=False,
+                   all_dissimilar=False):
     """Generates pair samples randomly for training Siamese network
 
     # Arguments
@@ -71,13 +72,20 @@ def pair_generator(data, batch_size):
     Yields batches of pairs of the form ([batch_1, batch_2], pairwise_labels)
     """
     class_ids = data.keys()
+    if all_dissimilar and all_similar:
+        raise ValueError('all_similar and all_dissimilar cannot both be True')
 
     while True:
 
         batch_list_1 = []
         batch_list_2 = []
-
-        labels = np.random.randint(2, size=(batch_size,))
+
+        if all_similar:
+            labels = np.ones(shape=(batch_size,))
+        elif all_dissimilar:
+            labels = np.zeros(shape=(batch_size,))
+        else:
+            labels = np.random.randint(2, size=(batch_size,))
         for batch_ind in range(batch_size):
 
             if labels[batch_ind] == 1:
@@ -130,3 +138,45 @@ def structured_batch_generator(data, num_classes_per_batch, num_samples_per_class
             yield (np.stack(batch_list), None)
         else:
             yield np.stack(batch_list)
+
+def random_sample_generator(data, batch_size=32, label_map=None, classes_per_batch=None, class_to_batch_ratio=None):
+    """
+    # Arguments
+        data: dict containing a numpy array for each class, or h5py Group containing an h5py Dataset for each class.
+        batch_size: number of samples per yielded batch.
+        label_map: A function that maps the class names (keys of the dataset dict or hdf5 datasets)
+            to a class index 0 - (num_classes-1).
+        classes_per_batch: restricts the sampling to a fixed number of classes in each batch.
+        class_to_batch_ratio: Alternative to `classes_per_batch`. If both are specified, `classes_per_batch` is used.
+
+    # Returns
+        Yields a batch of random samples from the dataset with corresponding integer class labels.
+    """
+    class_ids = data.keys()
+
+    if not label_map:
+        # If the class ids are ints 0..(num_classes-1), use them as labels directly
+        if all([type(i) is int for i in class_ids]) and (max(class_ids) == len(class_ids)-1):
+            label_map = lambda i: i
+        # Otherwise, assign each class its index in the class_ids list
+        else:
+            label_map = lambda i: class_ids.index(i)
+
+    while True:
+        batch_list = []
+        label_list = []
+        if class_to_batch_ratio and not classes_per_batch:
+            classes_per_batch = int(class_to_batch_ratio * batch_size)
+        if not classes_per_batch:
+            batch_class_ids = [rand.choice(class_ids) for _ in range(batch_size)]
+        else:
+            batch_class_ids = rand.sample(class_ids, classes_per_batch)
+
+        for _ in range(batch_size):
+            class_id = rand.choice(batch_class_ids)
+            sample_ind = np.random.randint(data[class_id].shape[0])
+            batch_list.append(data[class_id][sample_ind, ...])
+            # This works for the Stanford products dataset
+            label_list.append(label_map(class_id))
+
+        yield (np.stack(batch_list), np.stack(label_list))
```
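A toy illustration of the new `all_similar`/`all_dissimilar` flags, using an in-memory two-class dict of the kind the generators document; the array shapes here are arbitrary:

```python
# Toy data: two classes of 100 64-dim vectors each (shapes are arbitrary).
import numpy as np
from kml_data_utils import pair_generator

data = {0: np.random.rand(100, 64), 1: np.random.rand(100, 64)}
pos_gen = pair_generator(data, batch_size=16, all_similar=True)
(x1, x2), labels = next(pos_gen)  # each pair comes from one class; labels all 1
```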

kml_layers.py (+9 -3)

```diff
@@ -80,15 +80,21 @@ class PairDistances(Layer):
 
     """
 
-    def __init__(self, epsilon=1e-6, **kwargs):
+    def __init__(self, metric='l2', epsilon=1e-6, **kwargs):
+        self.metric = metric
         self.epsilon = epsilon
         super(PairDistances, self).__init__(**kwargs)
 
     def build(self, input_shape):
         super(PairDistances, self).build(input_shape)
 
     def call(self, x):
-        dists = K.sqrt(K.relu(K.sum(K.square(x[0]-x[1]), axis=1))+self.epsilon)
+        if self.metric == 'l2':
+            dists = K.sqrt(K.relu(K.sum(K.square(x[0]-x[1]), axis=1))+self.epsilon)
+        elif self.metric == 'l1':
+            dists = K.sum(K.abs(x[0]-x[1]), axis=1)
+        else:
+            raise ValueError("metric must be 'l1' or 'l2'")
         return K.expand_dims(dists, axis=-1)
 
 
@@ -262,7 +268,7 @@ def call(self, x):
         F = K.tf.boolean_mask(F, K.tf.logical_not(K.cast(K.eye(2*self.p), K.tf.bool)))
         F = K.reshape(F, [2*self.p, 2*self.p-1])
 
-        return K.mean(K.categorical_crossentropy(target=self.labels, output=F, from_logits=True))
+        return K.mean(K.categorical_crossentropy(target=self.labels, output=F, from_logits=True)) \
             + self.reg_coeff * K.mean(embedding_norms)
 
     def compute_output_shape(self, input_shape):
```
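A sketch of where `PairDistances` sits in a Siamese network, with the new `metric` argument; `base_network` (a shared embedding `Model`) and the input shape are placeholders:

```python
# Sketch: `base_network` and the input shape are placeholders,
# not defined in this commit.
from keras.layers import Input
from keras.models import Model
from kml_layers import PairDistances

input_a = Input(shape=(224, 224, 3))
input_b = Input(shape=(224, 224, 3))
emb_a = base_network(input_a)   # same weights applied to both branches
emb_b = base_network(input_b)
dists = PairDistances(metric='l1')([emb_a, emb_b])  # output shape (batch, 1)
siamese = Model([input_a, input_b], dists)
```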

kml_utils.py (+9 -2)

```diff
@@ -5,6 +5,7 @@
 from scipy.spatial.distance import pdist, squareform
 from sklearn.cluster import KMeans
 from sklearn.metrics import normalized_mutual_info_score
+from kml_data_utils import pair_generator, organize_by_class
 
 def recall_at_k(embeddings, labels, k=1, metric='euclidean'):
     """Computes the Recall@K metric
@@ -39,5 +40,11 @@ def nmi(embeddings, labels, metric='euclidean'):
     kmeans.fit(embeddings)
     return normalized_mutual_info_score(labels, kmeans.labels_)
 
-def plot_distance_distributions(test_data, num_pairs=10000, num_bins=100):
-    pass
+def plot_distance_distributions(embeddings, labels, num_pairs=10000, num_bins=100):
+    negative_distances = []
+    positive_distances = []
+    pos_gen = pair_generator(organize_by_class(embeddings, labels), batch_size=32, all_similar=True)
+    while len(positive_distances) < num_pairs // 2:
+        pairs = pos_gen.next()
+
```
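`plot_distance_distributions` is still unfinished at this commit. A sketch of one way it might be completed, assuming `pair_generator` yields `([batch_1, batch_2], labels)` as documented and that matplotlib is acceptable as a dependency; the `_collect_distances` helper and the `_sketch` function are hypothetical, not part of the library:

```python
# Hypothetical completion sketch; _collect_distances and
# plot_distance_distributions_sketch are not part of this commit.
import numpy as np
import matplotlib.pyplot as plt
from kml_data_utils import pair_generator, organize_by_class

def _collect_distances(gen, n):
    dists = []
    while len(dists) < n:
        (x1, x2), _ = next(gen)  # pairs arrive as ([batch_1, batch_2], labels)
        dists.extend(np.sqrt(np.sum((x1 - x2) ** 2, axis=1)))
    return dists[:n]

def plot_distance_distributions_sketch(embeddings, labels, num_pairs=10000, num_bins=100):
    data = organize_by_class(embeddings, labels)
    pos = _collect_distances(pair_generator(data, batch_size=32, all_similar=True), num_pairs // 2)
    neg = _collect_distances(pair_generator(data, batch_size=32, all_dissimilar=True), num_pairs // 2)
    plt.hist([pos, neg], bins=num_bins, label=['positive pairs', 'negative pairs'])
    plt.legend()
    plt.show()
```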
