Commit 5472015

TensorFlow word2vec back
committed · 1 parent 00cda75 · commit 5472015

File tree

2 files changed: +212, -0 lines changed

tensorflow_word2vec/presentation.pdf (598 KB)
Binary file not shown.

+212 lines
@@ -0,0 +1,212 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Simplified Word2Vec implementation in TensorFlow
(uses TensorFlow 1.x APIs such as tf.flags and tf.contrib).
"""

import tensorflow as tf
import nltk
import math
import os
import time

import numpy as np
from collections import defaultdict

from tensorflow.contrib.tensorboard.plugins import projector

# You can use TensorFlow's FLAGS to define program options with defaults,
# so that parameters can be changed from the command line.

tf.flags.DEFINE_integer("num_neg_samples", 4, "Number of negative samples")
tf.flags.DEFINE_integer("steps", 100000, "Number of training steps")
tf.flags.DEFINE_float("learning_rate", 1.0, "Learning rate for gradient descent")
tf.flags.DEFINE_integer("embedding_size", 100, "Size of the embedding")
tf.flags.DEFINE_boolean("lower_case", True, "Whether the corpus should be lowercased")
tf.flags.DEFINE_boolean("skip_gram", True, "Whether to use skip-gram (True) or CBOW (False)")
tf.flags.DEFINE_integer("min_frequency", 3, "Words that occur less often than this are discarded as OOV")

FLAGS = tf.flags.FLAGS
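
# Example invocation with flag overrides (illustrative; the script's actual filename
# is not shown in this diff, word2vec.py is assumed here):
#
#   python3 word2vec.py --steps=50000 --embedding_size=200 --skip_gram=False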


def ensure_dir(file_path):
    directory = os.path.dirname(file_path)
    if not os.path.exists(directory):
        os.makedirs(directory)


def build_graph(vocabulary_size, num_sampled, embedding_size, learning_rate):
    # Placeholders for inputs
    contexts = tf.placeholder(tf.int32, shape=[None])
    targets = tf.placeholder(tf.int32, shape=[None, 1])

    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    embed = tf.nn.embedding_lookup(embeddings, contexts)

    # Compute the NCE loss, using a sample of the negative labels each time
    # (a binary classifier separating the true target from num_sampled noise
    # words, instead of a full softmax over the whole vocabulary).
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=targets,
                       inputs=embed,
                       num_sampled=num_sampled,
                       num_classes=vocabulary_size))

    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss)
    # optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

    return embeddings, contexts, targets, optimizer, loss


def generate_batch(corpus, batch_size, skip_gram=True):

    contexts = np.ndarray(shape=(batch_size * 2), dtype=np.int32)
    targets = np.ndarray(shape=(batch_size * 2, 1), dtype=np.int32)

    for i in range(batch_size):
        # Pick a random token position, avoiding the first and last token.
        random_token_num = int(math.floor(np.random.random_sample() * (len(corpus) - 2))) + 1

        # E.g. for "the quick brown fox jumped over the lazy dog"
        # (context, target) pairs: ([the, brown], quick), ([quick, fox], brown), ([brown, jumped], fox)
        # We can simplify to: (the, quick), (brown, quick), (quick, brown), (fox, brown), ...  -> CBOW
        # => contexts are the ids of [the, brown, quick, fox, ...], labels/targets: [quick, quick, brown, brown, ...]
        # (quick, the), (quick, brown), (brown, quick), (brown, fox), ...  -> skip-gram
        # => contexts and targets reversed

        # left context pair
        left = [corpus[random_token_num - 1], corpus[random_token_num]]
        # right context pair
        right = [corpus[random_token_num + 1], corpus[random_token_num]]

        if skip_gram:
            left.reverse()
            right.reverse()

        contexts[i * 2] = left[0]
        contexts[i * 2 + 1] = right[0]

        targets[i * 2] = left[1]
        targets[i * 2 + 1] = right[1]

    return contexts, targets
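
# Illustrative example (not part of the original commit): for a toy id corpus
# [10, 11, 12, 13, 14] with batch_size=1 and skip_gram=True, if the sampled
# center position is 2 (id 12), generate_batch returns
#     contexts = [12, 12]        # the center word, once per neighbor
#     targets  = [[11], [13]]    # its left and right neighbors
# i.e. arrays of shape (2*batch_size,) and (2*batch_size, 1).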


# Load a text file, tokenize it, count occurrences and build a word encoder
# that translates a word into a unique id (sorted by word frequency).
def load_corpus(filename='t8.shakespeare.txt', lower_case=True, min_frequency=3):
    corpus = []

    i = 0
    with open(filename, 'r') as in_file:
        for line in in_file:
            if i % 1000 == 0:
                print('Loading', filename + ', processing line', i)

            if line[-1] == '\n':
                line = line[:-1]
            line = line.strip()
            if lower_case:
                line = line.lower()

            # You need to run nltk.download('punkt') for this to work:
            corpus += nltk.word_tokenize(line)

            i += 1

    print('compute word encoder...')
    word_counter = defaultdict(int)

    for word in corpus:
        word_counter[word] += 1

    word_counter = list(word_counter.items())
    word_counter = [elem for elem in word_counter if elem[1] >= min_frequency]
    word_counter.sort(key=lambda x: x[1], reverse=True)

    word_encoder = defaultdict(int)

    # Reserve id 0 for <UNK>: the defaultdict maps unseen words to 0, so known
    # words get ids starting at 1 (this matches the metadata.tsv written in
    # train(), whose first row is <UNK>).
    for i, elem in enumerate(word_counter):
        word_encoder[elem[0]] = i + 1

    print('done')

    return corpus, word_encoder
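
# Illustrative example (hypothetical input, not from the original commit): for a
# file containing only the line "to be or not to be", with lower_case=True and
# min_frequency=1, load_corpus returns
#     corpus       = ['to', 'be', 'or', 'not', 'to', 'be']
#     word_encoder = {'to': 1, 'be': 2, 'or': 3, 'not': 4}
# (frequency ties keep their first-seen order because the sort is stable; any
# word not in the encoder maps to id 0, i.e. <UNK>, via the defaultdict).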


def train(corpus, word_encoder, vocabulary_size, num_samples, steps):
    with tf.device('/cpu:0'):
        with tf.Session() as sess:
            embeddings, contexts, targets, optimizer, loss = build_graph(
                vocabulary_size, num_samples, FLAGS.embedding_size, FLAGS.learning_rate)

            ## summary ops
            timestamp = str(int(time.time()))
            train_summary_dir = os.path.join('./', 'w2v_summaries_' + timestamp) + '/'
            ensure_dir(train_summary_dir)
            print('Writing summaries and checkpoints to logdir: ' + train_summary_dir)
            model_ckpt_file = os.path.join('./w2v_summaries_' + timestamp + '/', 'model.ckpt')
            vocab_file = os.path.join(train_summary_dir, 'metadata.tsv')

            vocab_items = list(word_encoder.items())
            vocab_items.sort(key=lambda x: x[1])
            print(vocab_items[:100])
            vocab_list = [elem[0] for elem in vocab_items if elem[1] > 0]

            # Write the vocabulary for the TensorBoard embedding projector;
            # row 0 corresponds to the <UNK> id.
            with open(vocab_file, 'w') as vocab_file_out:
                vocab_file_out.write('<UNK>' + '\n')
                for word in vocab_list:
                    vocab_file_out.write(word + '\n')

            loss_summary = tf.summary.scalar('loss', loss)
            config = projector.ProjectorConfig()
            embedding = config.embeddings.add()
            embedding.tensor_name = embeddings.name
            embedding.metadata_path = vocab_file
            train_summary_op = tf.summary.merge_all()

            summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
            projector.visualize_embeddings(summary_writer, config)

            saver = tf.train.Saver(tf.global_variables())

            ## initialize parameters
            sess.run(tf.global_variables_initializer())

            losses = []

            ## now do batched SGD training
            for current_step in range(steps):
                inputs, labels = generate_batch(corpus, batch_size=32, skip_gram=FLAGS.skip_gram)
                feed_dict = {contexts: inputs, targets: labels}
                _, cur_loss = sess.run([optimizer, loss], feed_dict=feed_dict)

                losses.append(cur_loss)

                if current_step % 100 == 0 and current_step != 0:
                    summary_str = sess.run(train_summary_op, feed_dict=feed_dict)
                    summary_writer.add_summary(summary_str, current_step)

                if current_step % 1000 == 0:
                    print('step', current_step, 'mean loss:', np.mean(np.asarray(losses)))
                    saver.save(sess, model_ckpt_file, current_step)
                    losses = []

            embeddings_np = sess.run(embeddings)
            print('embedding matrix:', embeddings_np)

            # implement your neighbor search here
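            # A minimal sketch of such a search (added illustration, not part of the
            # original commit): cosine similarity of one embedding against all others,
            # using plain numpy; 'query_word' stands for any in-vocabulary word.
            #
            #   query_id = word_encoder[query_word]
            #   norms = np.linalg.norm(embeddings_np, axis=1, keepdims=True)
            #   unit = embeddings_np / np.maximum(norms, 1e-8)
            #   sims = unit.dot(unit[query_id])
            #   nearest_ids = np.argsort(-sims)[1:11]   # skip the query word itself
            #   print('10 nearest neighbor ids:', nearest_ids)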
199+
200+
201+
if __name__ == "__main__":
202+
corpus, word_encoder = load_corpus(lower_case=FLAGS.lower_case, min_frequency=FLAGS.min_frequency)
203+
204+
205+
corpus_num = [word_encoder[word] for word in corpus]
206+
207+
print('First few tokens of corpus:', corpus[:100])
208+
print('First few tokens of corpus_num:', list(corpus_num[:100]))
209+
210+
corpus_num = np.asarray(corpus_num)
211+
212+
train(corpus_num, word_encoder, vocabulary_size=max(corpus_num)+1, num_samples=FLAGS.num_neg_samples , steps=FLAGS.steps)
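
# To browse the learned embeddings in the TensorBoard projector afterwards, point
# TensorBoard at the summary directory the script creates (its name includes the
# run timestamp), e.g.:
#
#   tensorboard --logdir w2v_summaries_<timestamp>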

0 commit comments
