#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Simplified Word2Vec implementation in TensorFlow

"""

import tensorflow as tf
import nltk
import math
import os
import time

import numpy as np
from collections import defaultdict

from tensorflow.contrib.tensorboard.plugins import projector

# You can use TensorFlow's FLAGS to define program options with defaults, so that parameters can be changed from the command line

tf.flags.DEFINE_integer("num_neg_samples", 4, "Number of negative samples")
tf.flags.DEFINE_integer("steps", 100000, "Number of training steps")
tf.flags.DEFINE_float("learning_rate", 1.0, "Learning rate for the optimizer")
tf.flags.DEFINE_integer("embedding_size", 100, "Size of the embedding")
tf.flags.DEFINE_boolean("lower_case", True, "Whether the corpus should be lowercased")
tf.flags.DEFINE_boolean("skip_gram", True, "Whether skip-gram should be used instead of CBOW")
tf.flags.DEFINE_integer("min_frequency", 3, "Words that occur less often than this frequency are discarded as OOV")

FLAGS = tf.flags.FLAGS
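
# Example invocation (the script filename below is just a placeholder for this file):
#   python word2vec_simple.py --steps 50000 --embedding_size 200 --skip_gram=False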

def ensure_dir(file_path):
    directory = os.path.dirname(file_path)
    if not os.path.exists(directory):
        os.makedirs(directory)

def build_graph(vocabulary_size, num_sampled, embedding_size, learning_rate):
    # Placeholders for inputs
    contexts = tf.placeholder(tf.int32, shape=[None])
    targets = tf.placeholder(tf.int32, shape=[None, 1])

    # Embedding matrix: one embedding_size-dimensional vector per vocabulary entry
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

    # Output weights and biases for the NCE objective
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Look up the embedding vectors of the current batch of input word ids
    embed = tf.nn.embedding_lookup(embeddings, contexts)

    # Compute the NCE loss, using a sample of the negative labels each time.
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=targets,
                       inputs=embed,
                       num_sampled=num_sampled,
                       num_classes=vocabulary_size))

    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss)
    #optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

    return embeddings, contexts, targets, optimizer, loss

def generate_batch(corpus, batch_size, skip_gram=True):

    # Each sampled position yields two (context, target) pairs (left and right neighbor),
    # so the returned arrays contain 2*batch_size training examples.
    contexts = np.ndarray(shape=(batch_size*2), dtype=np.int32)
    targets = np.ndarray(shape=(batch_size*2, 1), dtype=np.int32)

    for i in range(batch_size):
        # pick a random position, excluding the first and last token of the corpus
        random_token_num = int(math.floor(np.random.random_sample() * (len(corpus) - 2))) + 1

        # E.g. for "the quick brown fox jumped over the lazy dog"
        # (context, target) pairs: ([the, brown], quick), ([quick, fox], brown), ([brown, jumped], fox)
        # We can simplify to: (the, quick), (brown, quick), (quick, brown), (fox, brown), ... CBOW
        # => contexts is ids of [the, brown, quick, fox, ...], labels/targets: [quick, quick, brown, brown, ...]
        # (quick, the), (quick, brown), (brown, quick), (brown, fox), ... Skip-gram
        # => contexts and targets reversed

        # left context pair
        left = [corpus[random_token_num-1], corpus[random_token_num]]
        # right context pair
        right = [corpus[random_token_num+1], corpus[random_token_num]]

        if skip_gram:
            left.reverse()
            right.reverse()

        contexts[i*2] = left[0]
        contexts[i*2+1] = right[0]

        targets[i*2] = left[1]
        targets[i*2+1] = right[1]

    return contexts, targets

# load a text file, tokenize it, count occurrences and build a word encoder that translates a word into a unique id (sorted by word frequency)
def load_corpus(filename='t8.shakespeare.txt', lower_case=True, min_frequency=3):
    corpus = []

    i = 0
    with open(filename, 'r') as in_file:
        for line in in_file:
            if i % 1000 == 0:
                print('Loading ' + filename + ', processing line', i)

            line = line.strip()
            if lower_case:
                line = line.lower()

            # You need to run nltk.download('punkt') for this to work:
            corpus += nltk.word_tokenize(line)

            i += 1

    print('compute word encoder...')
    word_counter = defaultdict(int)

    for word in corpus:
        word_counter[word] += 1

    word_counter = list(word_counter.items())
    word_counter = [elem for elem in word_counter if elem[1] >= min_frequency]
    word_counter.sort(key=lambda x: x[1], reverse=True)

    word_encoder = defaultdict(int)

    # id 0 is reserved for out-of-vocabulary words (<UNK>), so known words get ids
    # starting at 1, ordered by descending frequency
    for i, elem in enumerate(word_counter):
        word_encoder[elem[0]] = i + 1

    print('done')

    return corpus, word_encoder

def train(corpus, word_encoder, vocabulary_size, num_samples, steps):
    with tf.device('/cpu:0'):
        with tf.Session() as sess:
            embeddings, contexts, targets, optimizer, loss = build_graph(vocabulary_size, num_samples,
                                                                         FLAGS.embedding_size, FLAGS.learning_rate)

            ## summary ops
            timestamp = str(int(time.time()))
            train_summary_dir = os.path.join('./', 'w2v_summaries_' + timestamp) + '/'
            ensure_dir(train_summary_dir)
            print('Writing summaries and checkpoints to logdir: ' + train_summary_dir)
            model_ckpt_file = os.path.join(train_summary_dir, 'model.ckpt')
            vocab_file = os.path.join(train_summary_dir, 'metadata.tsv')

            vocab_items = list(word_encoder.items())
            vocab_items.sort(key=lambda x: x[1])
            print(vocab_items[:100])
            # id 0 is reserved for <UNK>; real vocabulary words have ids > 0
            vocab_list = [elem[0] for elem in vocab_items if elem[1] > 0]

            # write the vocabulary as TensorBoard projector metadata, one word per embedding row
            with open(vocab_file, 'w') as vocab_file_out:
                vocab_file_out.write('<UNK>' + '\n')
                for word in vocab_list:
                    vocab_file_out.write(word + '\n')

            loss_summary = tf.summary.scalar('loss', loss)
            config = projector.ProjectorConfig()
            embedding = config.embeddings.add()
            embedding.tensor_name = embeddings.name
            embedding.metadata_path = vocab_file
            train_summary_op = tf.summary.merge_all()

            summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
            projector.visualize_embeddings(summary_writer, config)

            saver = tf.train.Saver(tf.global_variables())

            ## initialize parameters
            sess.run(tf.global_variables_initializer())

            losses = []

            ## now do batched SGD training
            for current_step in range(steps):
                inputs, labels = generate_batch(corpus, batch_size=32, skip_gram=FLAGS.skip_gram)
                feed_dict = {contexts: inputs, targets: labels}
                _, cur_loss = sess.run([optimizer, loss], feed_dict=feed_dict)

                losses.append(cur_loss)

                if current_step % 100 == 0 and current_step != 0:
                    summary_str = sess.run(train_summary_op, feed_dict=feed_dict)
                    summary_writer.add_summary(summary_str, current_step)

                if current_step % 1000 == 0:
                    print('step', current_step, 'mean loss:', np.mean(np.asarray(losses)))
                    saver.save(sess, model_ckpt_file, global_step=current_step)
                    losses = []

            embeddings_np = sess.run(embeddings)
            print('embedding matrix:', embeddings_np)

            # implement your neighbor search here
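            # A minimal sketch of such a search (an assumption, not part of the original
            # code): cosine-similarity nearest neighbors on the learned embedding matrix.
            # The query words below are arbitrary examples and may be OOV for other corpora.
            id_to_word = ['<UNK>'] + vocab_list
            norms = np.linalg.norm(embeddings_np, axis=1, keepdims=True)
            normalized = embeddings_np / np.maximum(norms, 1e-8)
            for query in ['king', 'love', 'night']:
                query_id = word_encoder.get(query, 0)
                if query_id == 0:
                    continue  # skip words that are not in the vocabulary
                similarities = normalized @ normalized[query_id]
                # rank 0 is the query word itself, so report the next five neighbors
                nearest = np.argsort(-similarities)[1:6]
                print(query, '->', [id_to_word[idx] for idx in nearest])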


if __name__ == "__main__":
    corpus, word_encoder = load_corpus(lower_case=FLAGS.lower_case, min_frequency=FLAGS.min_frequency)

    corpus_num = [word_encoder[word] for word in corpus]

    print('First few tokens of corpus:', corpus[:100])
    print('First few tokens of corpus_num:', list(corpus_num[:100]))

    corpus_num = np.asarray(corpus_num)

    train(corpus_num, word_encoder, vocabulary_size=max(corpus_num)+1, num_samples=FLAGS.num_neg_samples, steps=FLAGS.steps)
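
# When run, this prints the mean loss roughly every 1000 steps; summaries and checkpoints
# land in ./w2v_summaries_<timestamp>/, which can be opened with TensorBoard to inspect
# the loss curve and the embedding projector (labelled via metadata.tsv).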