From c4528815b37378e29c8778efdf45946664e3a5cc Mon Sep 17 00:00:00 2001 From: zfang Date: Wed, 6 Feb 2019 21:16:17 -0800 Subject: [PATCH 1/4] Update __init__.py `_query_is_cached` always returns False because `_cache.get` expects the positional arguments (here, `key`) to be wrapped in a tuple inside the cache key. This renders the caching useless. --- pymagnitude/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pymagnitude/__init__.py b/pymagnitude/__init__.py index c60f44b..31b1c69 100644 --- a/pymagnitude/__init__.py +++ b/pymagnitude/__init__.py @@ -1308,8 +1308,8 @@ def _query_numpy(self, key, contextualize=False, normalized=None): def _query_is_cached(self, key, normalized=None): """Checks if the query been cached by Magnitude.""" normalized = normalized if normalized is not None else self.normalized - return ((self._vector_for_key_cached._cache.get((key, frozenset([('normalized', normalized)]))) is not None) or ( # noqa - self._out_of_vocab_vector_cached._cache.get((key, frozenset([('normalized', normalized)]))) is not None)) # noqa + return ((self._vector_for_key_cached._cache.get(((key,), frozenset([('normalized', normalized)]))) is not None) or ( # noqa + self._out_of_vocab_vector_cached._cache.get(((key,), frozenset([('normalized', normalized)]))) is not None)) # noqa @lru_cache(DEFAULT_LRU_CACHE_SIZE, ignore_unhashable_args=True) def distance(self, key, q): From 1043fc1112d87dff233334b89d39bd335c138358 Mon Sep 17 00:00:00 2001 From: Felix Fang Date: Fri, 8 Feb 2019 23:10:11 -0800 Subject: [PATCH 2/4] Fix another caching issue with _out_of_vocab_vector_cached --- pymagnitude/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pymagnitude/__init__.py b/pymagnitude/__init__.py index 31b1c69..8377cc5 100644 --- a/pymagnitude/__init__.py +++ b/pymagnitude/__init__.py @@ -1067,7 +1067,7 @@ def _vectors_for_keys_cached(self, keys, normalized=None, force=False): keys = [self._key_t(key) for key in keys] return self._process_lm_output(keys, normalized) 
unseen_keys = tuple( - key for key in keys if not self._query_is_cached(key, normalized)) + key for key in keys if not self._query_is_cached(key, normalized=normalized, force=force)) unseen_keys_map = {} if len(unseen_keys) > 0: unseen_keys_map = {self._key_t(k): i for i, k in @@ -1305,11 +1305,11 @@ def _query_numpy(self, key, contextualize=False, normalized=None): else: return r_val - def _query_is_cached(self, key, normalized=None): + def _query_is_cached(self, key, normalized=None, force=False): """Checks if the query been cached by Magnitude.""" normalized = normalized if normalized is not None else self.normalized return ((self._vector_for_key_cached._cache.get(((key,), frozenset([('normalized', normalized)]))) is not None) or ( # noqa - self._out_of_vocab_vector_cached._cache.get(((key,), frozenset([('normalized', normalized)]))) is not None)) # noqa + self._out_of_vocab_vector_cached._cache.get(((key,), frozenset([('normalized', normalized), ('force', force)]))) is not None)) # noqa @lru_cache(DEFAULT_LRU_CACHE_SIZE, ignore_unhashable_args=True) def distance(self, key, q): From 25cf26cf06e375d5b58491889242a110e6a9dd82 Mon Sep 17 00:00:00 2001 From: Felix Fang Date: Sat, 9 Feb 2019 00:38:42 -0800 Subject: [PATCH 3/4] Fix args/kwargs inconsistencies between calls because lru_cache treats them differently --- pymagnitude/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pymagnitude/__init__.py b/pymagnitude/__init__.py index 8377cc5..7bd0482 100644 --- a/pymagnitude/__init__.py +++ b/pymagnitude/__init__.py @@ -1208,9 +1208,9 @@ def query(self, q, pad_to_length=None, truncate_left = truncate_left or self.truncate_left if not isinstance(q, list): # Single key - vec = self._vector_for_key_cached(q, normalized) + vec = self._vector_for_key_cached(q, normalized=normalized) if vec is None: - return self._out_of_vocab_vector_cached(q, normalized) + return self._out_of_vocab_vector_cached(q, normalized, force=False) else: return vec 
elif isinstance(q, list) \ @@ -1308,8 +1308,8 @@ def _query_numpy(self, key, contextualize=False, normalized=None): def _query_is_cached(self, key, normalized=None, force=False): """Checks if the query been cached by Magnitude.""" normalized = normalized if normalized is not None else self.normalized - return ((self._vector_for_key_cached._cache.get(((key,), frozenset([('normalized', normalized)]))) is not None) or ( # noqa - self._out_of_vocab_vector_cached._cache.get(((key,), frozenset([('normalized', normalized), ('force', force)]))) is not None)) # noqa + return ((self._vector_for_key_cached._cache.get(((key,), frozenset([('normalized', normalized)]))) is not None) # noqa + or (self._out_of_vocab_vector_cached._cache.get(((key, normalized), frozenset([('force', force)]))) is not None)) # noqa @lru_cache(DEFAULT_LRU_CACHE_SIZE, ignore_unhashable_args=True) def distance(self, key, q): @@ -1906,7 +1906,7 @@ def __len__(self): def __contains__(self, key): """Checks whether a key exists in the vectors""" - return self._vector_for_key_cached(key) is not None + return self._vector_for_key_cached(key, normalized=self.normalized) is not None def __getitem__(self, q): """Performs the index method when indexed.""" From fd5f1325d4bdebef6cfdbe756b287c33199c3f40 Mon Sep 17 00:00:00 2001 From: Felix Fang Date: Sat, 9 Feb 2019 01:34:27 -0800 Subject: [PATCH 4/4] Use looked up vectors directly instead of calling query; fix _out_of_vocab_vector_cached._cache.get call --- pymagnitude/__init__.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/pymagnitude/__init__.py b/pymagnitude/__init__.py index 7bd0482..c115d02 100644 --- a/pymagnitude/__init__.py +++ b/pymagnitude/__init__.py @@ -1066,8 +1066,9 @@ def _vectors_for_keys_cached(self, keys, normalized=None, force=False): if self._is_lm() and not force: keys = [self._key_t(key) for key in keys] return self._process_lm_output(keys, normalized) + cached_vectors = {key: self._query_cached(key, 
normalized, force) for key in keys} unseen_keys = tuple( - key for key in keys if not self._query_is_cached(key, normalized=normalized, force=force)) + key for key in keys if cached_vectors[key] is None) unseen_keys_map = {} if len(unseen_keys) > 0: unseen_keys_map = {self._key_t(k): i for i, k in @@ -1106,8 +1107,8 @@ def _vectors_for_keys_cached(self, keys, normalized=None, force=False): unseen_vectors[i]) if unseen_vectors[i] is None: unseen_vectors[i] = self._out_of_vocab_vector_cached( - unseen_keys[i], normalized, force=force) - vectors = [self.query(key, normalized=normalized) + unseen_keys[i], normalized=normalized, force=force) + vectors = [cached_vectors[key] if key not in unseen_keys_map else unseen_vectors[unseen_keys_map[self._key_t(key)]] for key in keys] @@ -1210,7 +1211,7 @@ def query(self, q, pad_to_length=None, if not isinstance(q, list): # Single key vec = self._vector_for_key_cached(q, normalized=normalized) if vec is None: - return self._out_of_vocab_vector_cached(q, normalized, force=False) + return self._out_of_vocab_vector_cached(q, normalized=normalized) else: return vec elif isinstance(q, list) \ @@ -1305,11 +1306,13 @@ def _query_numpy(self, key, contextualize=False, normalized=None): else: return r_val - def _query_is_cached(self, key, normalized=None, force=False): + def _query_cached(self, key, normalized=None, force=False): """Checks if the query been cached by Magnitude.""" normalized = normalized if normalized is not None else self.normalized - return ((self._vector_for_key_cached._cache.get(((key,), frozenset([('normalized', normalized)]))) is not None) # noqa - or (self._out_of_vocab_vector_cached._cache.get(((key, normalized), frozenset([('force', force)]))) is not None)) # noqa + cached = self._vector_for_key_cached._cache.get(((key,), frozenset([('normalized', normalized)]))) + if cached is not None: + return cached + return self._out_of_vocab_vector_cached._cache.get(((key,), frozenset([('normalized', normalized), ('force', 
force)]))) @lru_cache(DEFAULT_LRU_CACHE_SIZE, ignore_unhashable_args=True) def distance(self, key, q):