w2v.py
'''
A program that trains word2vec embeddings in Python 3 with the help of gensim.
Step 1 : Fetch the list of files from the given directory (if none is given, look for a directory named "data")
Step 2 : Look for CSV files and, in those files, read the column named "content"
Step 3 : Parse every article into sentences
Step 4 : Split the sentences into words, taking care of phrases, and lowercase the entire corpus
Step 5 : Train Word2Vec with the required parameters
Step 6 : Save the embeddings (see the save/reload sketch inside the __main__ block at the bottom of this file)
Step 7 : Reduce the number of dimensions
Step 8 : Plot the vectors obtained after dimensionality reduction
Step 9 : Perform the operation you need
'''
import glob
import logging
import multiprocessing
import os
import re
from itertools import islice
from multiprocessing import Pool

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
import sklearn.manifold
from gensim.models import Word2Vec, Phrases
from gensim.models.phrases import Phraser
from nltk.corpus import stopwords

phrases = Phrases(min_count=5, threshold=10, delimiter=b'_', progress_per=5000)
tsne = sklearn.manifold.TSNE(n_components=2, random_state=0)
cachedStopWords = stopwords.words("english")
phraser = None      # built once the phrase vocabulary has been collected
article2vec = None  # assigned once the Word2Vec model has been trained
class W2V:
    NUMBER_OF_PROCESSOR = 8

    # Entry point: configure logging, load the sentence tokenizer,
    # initialise the per-instance state and read the CSV files in the given directory.
    def __init__(self, location):
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        self.original_location = ""
        self.sentences = []
        self.raw_sentences = []
        self.corpus_raw = u""
        self.pool = None
        self.read_files(location)
    # List the CSV files in the given directory and process each one
    def read_files(self, location):
        self.original_location = os.getcwd()
        os.chdir("./" + location)
        list_of_files = glob.glob("*.csv")
        print(list_of_files)
        for fi in list_of_files:
            self.fetch_data_from_file(fi)
    # Fetch the "content" column of one CSV file, build the corpus,
    # detect phrases and train Word2Vec on the resulting sentences
    def fetch_data_from_file(self, file_name):
        print("Current file being processed is " + file_name)
        df = pd.read_csv(file_name)
        list_of_article = df['content']
        print("Length : " + str(len(list_of_article)))
        lol = self.lol(list_of_article, self.NUMBER_OF_PROCESSOR)
        self.pool = Pool(self.NUMBER_OF_PROCESSOR)
        # Clean each chunk of articles in parallel; every worker returns one big string
        chunk_corpus_list = list(self.pool.map(self.add_content_to_corpus, lol))
        for chunks in chunk_corpus_list:
            self.corpus_raw += chunks
        print("String length of corpus : " + str(len(self.corpus_raw)))
        # Split the chunk strings produced by the worker pool into sentences
        all_sentences = self.pool.map(self.token_maker, chunk_corpus_list)
        for i in all_sentences:
            for j in i:
                self.raw_sentences.append(j)
        print(self.raw_sentences[0])
        print("Number of Sentences are : " + str(len(self.raw_sentences)))
        # Collect the phrase vocabulary. add_vocab has to run in the parent
        # process, because updates made inside pool workers are lost when they exit.
        for tokenized_chunk in self.pool.map(self.phrases_generation, all_sentences):
            phrases.add_vocab(tokenized_chunk)
        global phraser
        phraser = Phraser(phrases)
        # Apply the phraser in the parent process as well: the pool workers were
        # forked before the phraser was built and would only see a stale copy.
        for chunk in all_sentences:
            for sentence_words in self.word_sentence(chunk):
                self.sentences.append(sentence_words)
        # Dimensionality of the resulting word vectors:
        # more dimensions are more expensive to train but more expressive.
        num_features = 300
        # Minimum word count threshold.
        min_word_count = 3
        # Number of worker threads: the more workers, the faster the training.
        num_workers = multiprocessing.cpu_count()
        # Context window length.
        context_size = 7
        # Down-sample setting for frequent words (values between 1e-5 and 1e-3 are typical).
        downsampling = 1e-3
        # Seed for the random number generator, to make the results reproducible
        # (deterministic, good for debugging).
        seed = 1
        global article2vec
        article2vec = Word2Vec(
            sg=1,
            seed=seed,
            workers=num_workers,
            size=num_features,
            min_count=min_word_count,
            window=context_size,
            sample=downsampling
        )
        article2vec.build_vocab(self.sentences)
        article2vec.train(self.sentences,
                          total_examples=article2vec.corpus_count,
                          epochs=article2vec.iter)
        # L2-normalise the vectors once training is finished (saves memory,
        # but the model can no longer be trained further).
        article2vec.init_sims(replace=True)
        print("Words similar to Trump : ")
        print(article2vec.wv.most_similar("trump", topn=15))
        print("Words similar to Obama : ")
        print(article2vec.wv.most_similar("obama", topn=15))
        # all_word_vectors_matrix = article2vec.wv.vectors
        # print(article2vec.wv.vectors[0])
        # list_of_emb = self.chunk(article2vec.wv.vectors, self.NUMBER_OF_PROCESSOR)
        # df_list = self.pool.map(self.dimension_reduction, list_of_emb)
        # points = pd.DataFrame(columns=["word", "x", "y"])
        # for item in df_list:
        #     points.append(item)
        #
        # print(points.head(10))
        # sns.set_context("poster")
        # points.plot.scatter("x", "y", s=10, figsize=(20, 12))
        self.pool.close()
        self.pool.terminate()
        print("Word2Vec vocabulary length:", len(article2vec.wv.vocab))
    # Split one big list into `size` smaller lists and return the list of lists
    @staticmethod
    def lol(list_of_article, size):
        return list(list_of_article[i::size] for i in range(size))

    # Lowercase a chunk of articles, drop stop words, single-letter tokens and
    # quote characters, and concatenate everything into one string
    @staticmethod
    def add_content_to_corpus(list_of_article):
        string_of_article = u""
        for article in list_of_article:
            # lowercase first so that capitalised stop words are removed as well
            curr = ' '.join([word for word in article.lower().split() if word not in cachedStopWords])
            curr = re.sub(r'\b\w\b', '', curr)
            string_of_article += curr.replace("‘", '').replace("’", '').replace("'", '')
        return string_of_article
    # Split one chunk string into sentences with the punkt tokenizer
    @staticmethod
    def token_maker(data):
        return nltk.data.load('tokenizers/punkt/english.pickle').tokenize(data)

    # Tokenise every sentence of a chunk into words and return the result so
    # the parent process can feed it to Phrases.add_vocab
    @staticmethod
    def phrases_generation(data):
        data_new = []
        for i in data:
            data_new.append(sentences_to_words(i))
        return data_new
    # Turn the non-empty sentences of a chunk into phrase-merged word lists
    @staticmethod
    def word_sentence(data):
        ret = []
        for items in data:
            if len(items) > 0:
                ret.append(phraser[sentences_to_words(items)])
        return ret
    # Split an iterable into consecutive chunks of at most `size` items
    @staticmethod
    def chunk(it, size):
        it = iter(it)
        return list(iter(lambda: list(islice(it, size)), []))
    # Project the word vectors down to 2D with t-SNE and return a DataFrame
    # of (word, x, y) rows
    @staticmethod
    def dimension_reduction(all_word_vectors_matrix):
        # TODO: Use PCA or LDA first to narrow down the dimensions and then run t-SNE?
        all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix)
        points = pd.DataFrame(
            [
                (word, coords[0], coords[1])
                for word, coords in [(word, all_word_vectors_matrix_2d[article2vec.wv.vocab[word].index])
                                     for word in article2vec.wv.vocab]
            ],
            columns=["word", "x", "y"]
        )
        return points
# Strip everything but letters from a sentence and split it into word tokens
def sentences_to_words(sentence):
    clean = re.sub("[^a-zA-Z]", " ", sentence)
    words = clean.split()
    return words
if __name__ == "__main__":
    # Guarding the entry point keeps multiprocessing.Pool usable on platforms
    # that spawn rather than fork worker processes.
    W2V("all-the-news")
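    # --- Saving and reloading the embeddings (Step 6 of the docstring) ---
    # A minimal sketch, not part of the original pipeline: the steps above
    # train the model but never persist it. The calls below use the standard
    # gensim save/load API; "news_w2v.model" is an arbitrary file name chosen
    # here for illustration.
    article2vec.save("news_w2v.model")
    reloaded = Word2Vec.load("news_w2v.model")
    # Sanity check: the reloaded model should answer the same similarity query.
    print(reloaded.wv.most_similar("trump", topn=5))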