Skip to content

Commit 222eaef

Browse files
committed
Modifying LDA example to use pretrained word vectors
1 parent 3f0baf9 commit 222eaef

File tree

1 file changed

+8
-2
lines changed
  • examples/twenty_newsgroups

1 file changed

+8
-2
lines changed

examples/twenty_newsgroups/lda.py

+8-2
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
texts = [unicode(d) for d in texts]
2525

2626
# Preprocess data
27-
max_length = 10000 # Limit of 1k words per document
27+
max_length = 10000 # Limit of 10k words per document
2828
tokens, vocab = preprocess.tokenize(texts, max_length, tag=False,
2929
parse=False, entity=False)
3030
corpus = Corpus()
@@ -45,14 +45,18 @@
4545
# and OoV words
4646
doc_ids = np.arange(pruned.shape[0])
4747
flattened, (doc_ids,) = corpus.compact_to_flat(pruned, doc_ids)
48+
# Optionally, we can initialize our word vectors from a pretrained
49+
# model. This helps when our corpus is small and we'd like to bootstrap
50+
word_vectors = corpus.compact_word_vectors(vocab)
4851

4952
# Model Parameters
5053
# Number of documents
5154
n_docs = len(texts)
5255
# Number of unique words in the vocabulary
5356
n_words = flattened.max() + 1
5457
# Number of dimensions in a single word vector
55-
n_hidden = 128
58+
# (if using pretrained vectors, should match that dimensionality)
59+
n_hidden = 300
5660
# Number of topics to fit
5761
n_topics = 20
5862
# Get the count for each key
@@ -64,6 +68,8 @@
6468
model = LDA2Vec(n_words, n_hidden, counts, dropout_ratio=0.2)
6569
model.add_categorical_feature(n_docs, n_topics, name='document_id')
6670
model.finalize()
71+
# Optional: we can use the pretrained word vectors
72+
model.vocab.W[:, :] = word_vectors
6773
if os.path.exists('model.hdf5'):
6874
serializers.load_hdf5('model.hdf5', model)
6975
for _ in range(200):

0 commit comments

Comments (0)