@@ -473,6 +473,68 @@ def word_list(self, vocab, max_compact_index=None, oov_token='<OoV>'):
473
473
words .append (string )
474
474
return words
475
475
476
def compact_word_vectors(self, vocab, array=None):
    """ Retrieve pretrained spaCy word vectors for our vocabulary.
    The returned word array has row indices corresponding to the
    compact index of a word, and columns corresponding to the word
    vector.

    Arguments
    ---------
    vocab : dict
        Dictionary where keys are the loose index, and values are
        the word string.
    array : numpy float array, optional
        Existing array to fill in place instead of allocating a new
        one. Rows are indexed by compact index; rows for words that
        are skipped are left untouched. When omitted, a zero-filled
        float32 array with one row per entry of `compact_to_loose` is
        allocated as soon as the first word vector is found.

    Returns
    -------
    data : numpy float array or None
        Array such that data[compact_index, :] = word_vector. Words
        that are missing from `vocab`, that do not map to exactly one
        spaCy token, or that have no pretrained vector keep their
        existing row (all zeros for a freshly allocated array).
        None is returned when `array` is not given and no word in the
        vocabulary has a vector.

    Examples
    --------
    >>> import numpy.linalg as nl
    >>> vocab = {19: 'shuttle', 5: 'astronomy', 7: 'cold', 3: 'hot'}
    >>> word_indices = np.zeros(50).astype('int32')
    >>> word_indices[:25] = 19  # 'Shuttle' shows 25 times
    >>> word_indices[25:35] = 5  # 'astronomy' is in 10 times
    >>> word_indices[40:46] = 7  # 'cold' is in 6 times
    >>> word_indices[46:] = 3  # 'hot' is in 4 times
    >>> corpus = Corpus()
    >>> corpus.update_word_count(word_indices)
    >>> corpus.finalize()
    >>> v = corpus.compact_word_vectors(vocab)
    >>> sim = lambda x, y: np.dot(x, y) / nl.norm(x) / nl.norm(y)
    >>> vocab[corpus.compact_to_loose[2]]
    'shuttle'
    >>> vocab[corpus.compact_to_loose[3]]
    'astronomy'
    >>> vocab[corpus.compact_to_loose[4]]
    'cold'
    >>> sim_shuttle_astro = sim(v[2, :], v[3, :])
    >>> sim_shuttle_cold = sim(v[2, :], v[4, :])
    >>> sim_shuttle_astro > sim_shuttle_cold
    True
    """
    # Imported lazily so spaCy is only required when this method is used.
    import spacy.en
    nlp = spacy.en.English()

    # `unicode` only exists on Python 2; fall back to `str` elsewhere.
    try:
        to_text = unicode
    except NameError:
        to_text = str

    # Compare against None explicitly: the truth value of a NumPy array
    # with more than one element is ambiguous and raises ValueError.
    data = array if array is not None else None
    n_words = len(self.compact_to_loose)
    for compact, loose in self.compact_to_loose.items():
        word = vocab.get(loose, None)
        if word is None:
            continue
        tokens = nlp(to_text(word))
        # An entry that tokenizes to zero or several tokens has no single
        # word vector; skip it instead of failing on tuple unpacking.
        if len(tokens) != 1:
            continue
        token = tokens[0]
        if not token.has_vector:
            continue
        vector = token.vector
        if data is None:
            # The vector width is only known once the first vector is seen.
            n_dim = vector.shape[0]
            data = np.zeros((n_words, n_dim), dtype='float32')
        data[compact, :] = vector[:]
    return data
537
+
476
538
477
539
def fast_replace (data , keys , values , skip_checks = False ):
478
540
""" Do a search-and-replace in array `data`.
0 commit comments