
Commit 5bf8acc

Committed by Ubuntu
Removing monolithic lda2vec
1 parent 8618db2 commit 5bf8acc

File tree: 9 files changed (+58, -1444 lines)


examples/twenty_newsgroups/lda.ipynb (-477 lines)
This file was deleted.

examples/twenty_newsgroups/lda.py (-104 lines)
This file was deleted.

simple/model.py renamed to examples/twenty_newsgroups/model.py (+6, -51 lines)
@@ -4,77 +4,32 @@
 # This simple example loads the newsgroups data from sklearn
 # and train an LDA-like model on it
 import os.path
-import logging
 import pickle
 import time

-from sklearn.datasets import fetch_20newsgroups
 from chainer import serializers
 import chainer.optimizers as O
 import numpy as np

-from lda2vec import preprocess, Corpus, utils
+from lda2vec import utils
 from lda2vec import prepare_topics, print_top_words_per_topic
 from simple_lda2vec import LDA2Vec

-logging.basicConfig()
-
-# Fetch data
-removes = ('header', 'footer', 'quotes')
-texts = fetch_20newsgroups(subset='train', remove=removes).data
-
-
-def replace(t):
-    sep = "max>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax"
-    return t.replace('`@("', '').replace("'ax>", '').replace(sep, '')
-
-# Preprocess data
-max_length = 10000  # Limit of 10k words per document
-if not os.path.exists('doc_ids.npy'):
-    # Convert to unicode (spaCy only works with unicode)
-    texts = [unicode(replace(d)) for d in texts]
-    tokens, vocab = preprocess.tokenize(texts, max_length, merge=True,
-                                        n_threads=4)
-    corpus = Corpus()
-    # Make a ranked list of rare vs frequent words
-    corpus.update_word_count(tokens)
-    corpus.finalize()
-    # The tokenization uses spaCy indices, and so may have gaps
-    # between indices for words that aren't present in our dataset.
-    # This builds a new compact index
-    compact = corpus.to_compact(tokens)
-    # Remove extremely rare words
-    pruned = corpus.filter_count(compact, min_count=15)
-    # Words tend to have power law frequency, so selectively
-    # downsample the most prevalent words
-    clean = corpus.subsample_frequent(pruned)
-    # Now flatten a 2D array of document per row and word position
-    # per column to a 1D array of words. This will also remove skips
-    # and OoV words
-    doc_ids = np.arange(pruned.shape[0])
-    flattened, (doc_ids,) = corpus.compact_to_flat(pruned, doc_ids)
-    # Save all of the preprocessed files
-    pickle.dump(vocab, open('vocab.pkl', 'w'))
-    pickle.dump(corpus, open('corpus.pkl', 'w'))
-    np.save("flattened", flattened)
-    np.save("doc_ids", doc_ids)
-else:
-    vocab = pickle.load(open('vocab.pkl', 'r'))
-    corpus = pickle.load(open('corpus.pkl', 'r'))
-    flattened = np.load("flattened.npy")
-    doc_ids = np.load("doc_ids.npy")
+vocab = pickle.load(open('vocab.pkl', 'r'))
+corpus = pickle.load(open('corpus.pkl', 'r'))
+flattened = np.load("flattened.npy")
+doc_ids = np.load("doc_ids.npy")

 # Optionally, we can initialize our word vectors from a pretrained
 # model. This helps when our corpus is small and we'd like to bootstrap
 word_vectors = corpus.compact_word_vectors(vocab)

 # Model Parameters
 # Number of documents
-n_docs = len(texts)
+n_docs = doc_ids.max() + 1
 # Number of unique words in the vocabulary
 n_vocab = flattened.max() + 1
 # Number of dimensions in a single word vector
-# (if using pretrained vectors, should match that dimensionality)
 n_units = 256
 # 'Strength' of the dircihlet prior; 200.0 seems to work well
 clambda = 200.0
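
Why n_docs changed: model.py no longer fetches the raw newsgroup texts, so the document count cannot come from len(texts); it is recovered from the saved doc_ids array instead. A minimal numpy-only sketch (illustrative toy data, not from the repository) of the flattened layout that makes this work:

import numpy as np

# Toy stand-ins for the arrays the preprocessing step saves: one long 1-D
# array of compact word ids, plus a parallel array recording which document
# each token came from.
flattened = np.array([3, 7, 7, 2, 9, 3])  # compact word indices, skips/OoV removed
doc_ids = np.array([0, 0, 0, 1, 1, 1])    # token i belongs to document doc_ids[i]

n_docs = doc_ids.max() + 1     # documents are labelled 0..n_docs-1, so this gives 2
n_vocab = flattened.max() + 1  # compact word ids run 0..n_vocab-1, so this gives 10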
New file (+52 lines)
@@ -0,0 +1,52 @@
+# Author: Chris Moody <[email protected]>
+# License: MIT
+
+# This simple example loads the newsgroups data from sklearn
+# and train an LDA-like model on it
+import logging
+import pickle
+
+from sklearn.datasets import fetch_20newsgroups
+import numpy as np
+
+from lda2vec import preprocess, Corpus
+
+logging.basicConfig()
+
+# Fetch data
+texts = fetch_20newsgroups(subset='train').data
+
+
+def replace(t):
+    sep = "max>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax"
+    return t.replace('`@("', '').replace("'ax>", '').replace(sep, '')
+
+# Preprocess data
+max_length = 10000  # Limit of 10k words per document
+# Convert to unicode (spaCy only works with unicode)
+texts = [unicode(replace(d)) for d in texts]
+tokens, vocab = preprocess.tokenize(texts, max_length, merge=True,
+                                    n_threads=4)
+corpus = Corpus()
+# Make a ranked list of rare vs frequent words
+corpus.update_word_count(tokens)
+corpus.finalize()
+# The tokenization uses spaCy indices, and so may have gaps
+# between indices for words that aren't present in our dataset.
+# This builds a new compact index
+compact = corpus.to_compact(tokens)
+# Remove extremely rare words
+pruned = corpus.filter_count(compact, min_count=15)
+# Words tend to have power law frequency, so selectively
+# downsample the most prevalent words
+clean = corpus.subsample_frequent(pruned)
+# Now flatten a 2D array of document per row and word position
+# per column to a 1D array of words. This will also remove skips
+# and OoV words
+doc_ids = np.arange(pruned.shape[0])
+flattened, (doc_ids,) = corpus.compact_to_flat(pruned, doc_ids)
+# Save all of the preprocessed files
+pickle.dump(vocab, open('vocab.pkl', 'w'))
+pickle.dump(corpus, open('corpus.pkl', 'w'))
+np.save("flattened", flattened)
+np.save("doc_ids", doc_ids)
Binary file not shown (-3.09 MB)

lda2vec/__init__.py (-1 line)
@@ -8,7 +8,6 @@

 dirichlet_likelihood = dirichlet_likelihood.dirichlet_likelihood
 EmbedMixture = embed_mixture.EmbedMixture
-LDA2Vec = lda2vec.LDA2Vec
 Tracking = tracking.Tracking
 tokenize = preprocess.tokenize
 Corpus = corpus.Corpus
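
With this line gone, the package no longer re-exports LDA2Vec at the top level; as the model.py diff above shows, the example now imports the class from a local module instead. A sketch of the import change (simple_lda2vec is the module name used in the diff, not a documented package API):

# Before this commit, the class was re-exported by the package:
#     from lda2vec import LDA2Vec
# After it, the twenty_newsgroups example imports a local implementation:
from simple_lda2vec import LDA2Vec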
