24  24     texts = [unicode(d) for d in texts]
25  25
26  26     # Preprocess data
27      -  max_length = 10000   # Limit of 1k words per document
    27  +  max_length = 10000   # Limit of 10k words per document
28  28     tokens, vocab = preprocess.tokenize(texts, max_length, tag=False,
29  29                                         parse=False, entity=False)
30  30     corpus = Corpus()

45  45     # and OoV words
46  46     doc_ids = np.arange(pruned.shape[0])
47  47     flattened, (doc_ids,) = corpus.compact_to_flat(pruned, doc_ids)
    48  +  # Optionally, we can initialize our word vectors from a pretrained
    49  +  # model. This helps when our corpus is small and we'd like to bootstrap
    50  +  word_vectors = corpus.compact_word_vectors(vocab)
48  51
49  52     # Model Parameters
50  53     # Number of documents
51  54     n_docs = len(texts)
52  55     # Number of unique words in the vocabulary
53  56     n_words = flattened.max() + 1
54  57     # Number of dimensions in a single word vector
55      -  n_hidden = 128
    58  +  # (if using pretrained vectors, should match that dimensionality)
    59  +  n_hidden = 300
56  60     # Number of topics to fit
57  61     n_topics = 20
58  62     # Get the count for each key

64  68     model = LDA2Vec(n_words, n_hidden, counts, dropout_ratio=0.2)
65  69     model.add_categorical_feature(n_docs, n_topics, name='document_id')
66  70     model.finalize()
    71  +  # Optional: we can use the pretrained word vectors
    72  +  model.vocab.W[:, :] = word_vectors
67  73     if os.path.exists('model.hdf5'):
68  74         serializers.load_hdf5('model.hdf5', model)
69  75     for _ in range(200):