Skip to content

Commit 3f0baf9

Browse files
committed
Added loading word vectors from spacy
1 parent 1d3c5aa commit 3f0baf9

File tree

1 file changed

+62
-0
lines changed

1 file changed

+62
-0
lines changed

lda2vec/corpus.py

+62
Original file line numberDiff line numberDiff line change
@@ -473,6 +473,68 @@ def word_list(self, vocab, max_compact_index=None, oov_token='<OoV>'):
473473
words.append(string)
474474
return words
475475

476+
def compact_word_vectors(self, vocab, array=None):
    """ Retrieve pretrained SpaCy word vectors for our vocabulary.

    The returned word array has row indices corresponding to the
    compact index of a word, and columns corresponding to the word
    vector.

    Arguments
    ---------
    vocab : dict
        Dictionary where keys are the loose index, and values are
        the word string.
    array : numpy float array, optional
        Existing array to fill in place instead of allocating a new
        one. The row at each compact index is overwritten for every
        word that has a vector; all other rows are left untouched.

    Returns
    -------
    data : numpy float array or None
        Array such that data[compact_index, :] = word_vector. Rows of
        a freshly allocated array stay zero for words that are missing
        from `vocab` or that have no pretrained vector. If `array` was
        not given and no word has a vector, None is returned.

    Examples
    --------
    >>> import numpy.linalg as nl
    >>> vocab = {19: 'shuttle', 5: 'astronomy', 7: 'cold', 3: 'hot'}
    >>> word_indices = np.zeros(50).astype('int32')
    >>> word_indices[:25] = 19  # 'Shuttle' shows 25 times
    >>> word_indices[25:35] = 5  # 'astronomy' is in 10 times
    >>> word_indices[40:46] = 7  # 'cold' is in 6 times
    >>> word_indices[46:] = 3  # 'hot' is in 4 times
    >>> corpus = Corpus()
    >>> corpus.update_word_count(word_indices)
    >>> corpus.finalize()
    >>> v = corpus.compact_word_vectors(vocab)
    >>> sim = lambda x, y: np.dot(x, y) / nl.norm(x) / nl.norm(y)
    >>> vocab[corpus.compact_to_loose[2]]
    'shuttle'
    >>> vocab[corpus.compact_to_loose[3]]
    'astronomy'
    >>> vocab[corpus.compact_to_loose[4]]
    'cold'
    >>> sim_shuttle_astro = sim(v[2, :], v[3, :])
    >>> sim_shuttle_cold = sim(v[2, :], v[4, :])
    >>> sim_shuttle_astro > sim_shuttle_cold
    True
    """
    # Use the caller's array whenever one was passed. Only None means
    # "not supplied": truth-testing a numpy array (`if array:`) raises
    # ValueError as soon as it holds more than one element.
    data = array
    if not self.compact_to_loose:
        # Nothing to look up, so skip loading the SpaCy model entirely.
        return data

    # Imported lazily so SpaCy is only needed when vectors are requested.
    import spacy.en
    nlp = spacy.en.English()

    # SpaCy expects text (unicode) input; `unicode` only exists on Python 2.
    try:
        to_text = unicode
    except NameError:
        to_text = str

    n_words = len(self.compact_to_loose)
    for compact, loose in self.compact_to_loose.items():
        word = vocab.get(loose)
        if word is None:
            # Compact index with no entry in `vocab`: leave its row as is.
            continue
        # Each vocabulary entry is expected to parse to exactly one token;
        # the single-element unpacking raises ValueError otherwise.
        token, = nlp(to_text(word))
        if not token.has_vector:
            continue
        vector = token.vector
        if data is None:
            # Allocate lazily: the vector dimensionality is only known
            # once the first pretrained vector has been seen.
            data = np.zeros((n_words, vector.shape[0]), dtype='float32')
        data[compact, :] = vector
    return data
537+
476538

477539
def fast_replace(data, keys, values, skip_checks=False):
478540
""" Do a search-and-replace in array `data`.

0 commit comments

Comments
 (0)