 # This simple example loads the newsgroups data from sklearn
 # and trains an LDA-like model on it
 import os.path
-import logging
 import pickle
 import time

-from sklearn.datasets import fetch_20newsgroups
 from chainer import serializers
 import chainer.optimizers as O
 import numpy as np

-from lda2vec import preprocess, Corpus, utils
+from lda2vec import utils
 from lda2vec import prepare_topics, print_top_words_per_topic
 from simple_lda2vec import LDA2Vec

-logging.basicConfig()
-
-# Fetch data
-removes = ('header', 'footer', 'quotes')
-texts = fetch_20newsgroups(subset='train', remove=removes).data
-
-
-def replace(t):
-    sep = "max>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax"
-    return t.replace('`@("', '').replace("'ax>", '').replace(sep, '')
-
-# Preprocess data
-max_length = 10000   # Limit of 10k words per document
-if not os.path.exists('doc_ids.npy'):
-    # Convert to unicode (spaCy only works with unicode)
-    texts = [unicode(replace(d)) for d in texts]
-    tokens, vocab = preprocess.tokenize(texts, max_length, merge=True,
-                                        n_threads=4)
-    corpus = Corpus()
-    # Make a ranked list of rare vs frequent words
-    corpus.update_word_count(tokens)
-    corpus.finalize()
-    # The tokenization uses spaCy indices, and so may have gaps
-    # between indices for words that aren't present in our dataset.
-    # This builds a new compact index
-    compact = corpus.to_compact(tokens)
-    # Remove extremely rare words
-    pruned = corpus.filter_count(compact, min_count=15)
-    # Words tend to have power law frequency, so selectively
-    # downsample the most prevalent words
-    clean = corpus.subsample_frequent(pruned)
-    # Now flatten a 2D array of document per row and word position
-    # per column to a 1D array of words. This will also remove skips
-    # and OoV words
-    doc_ids = np.arange(pruned.shape[0])
-    flattened, (doc_ids,) = corpus.compact_to_flat(pruned, doc_ids)
-    # Save all of the preprocessed files
-    pickle.dump(vocab, open('vocab.pkl', 'w'))
-    pickle.dump(corpus, open('corpus.pkl', 'w'))
-    np.save("flattened", flattened)
-    np.save("doc_ids", doc_ids)
-else:
-    vocab = pickle.load(open('vocab.pkl', 'r'))
-    corpus = pickle.load(open('corpus.pkl', 'r'))
-    flattened = np.load("flattened.npy")
-    doc_ids = np.load("doc_ids.npy")
+vocab = pickle.load(open('vocab.pkl', 'r'))
+corpus = pickle.load(open('corpus.pkl', 'r'))
+flattened = np.load("flattened.npy")
+doc_ids = np.load("doc_ids.npy")

 # Optionally, we can initialize our word vectors from a pretrained
 # model. This helps when our corpus is small and we'd like to bootstrap
 word_vectors = corpus.compact_word_vectors(vocab)

 # Model Parameters
 # Number of documents
-n_docs = len(texts)
+n_docs = doc_ids.max() + 1
 # Number of unique words in the vocabulary
 n_vocab = flattened.max() + 1
 # Number of dimensions in a single word vector
-# (if using pretrained vectors, should match that dimensionality)
 n_units = 256
 # 'Strength' of the Dirichlet prior; 200.0 seems to work well
 clambda = 200.0