Skip to content

Commit 5cf03c0

Browse files
unknownunknown
unknown
authored and
unknown
committed
WE filters
1 parent 533a339 commit 5cf03c0

File tree

6 files changed

+1082
-0
lines changed

6 files changed

+1082
-0
lines changed

filters/WE_Average/WE_Average.py

+185
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
# sys.path.append(os.getcwd() + '/..') # Uncomment for standalone running
2+
from abstract_filter import *
3+
from collections import Counter
4+
from scipy.sparse import lil_matrix
5+
from scipy.spatial.distance import cosine
6+
from gensim.matutils import Sparse2Corpus
7+
from gensim.models import lsimodel
8+
import numpy as np
9+
import os.path
10+
import math
11+
# from sklearn.decomposition import TruncatedSVD as SVD
12+
13+
14+
class WE_Average(AbstractFilter):
    """Filter translation units (TUs) by comparing word-vector sums.

    Every known word is mapped to a row of ``self.vectors``.  A TU is scored
    with the cosine distance between the sum of its source-side word vectors
    and the sum of its target-side word vectors.  The mean and standard
    deviation of that distance are estimated over the corpus, and ``decide``
    accepts a TU whose distance lies within ``var_mult`` standard deviations
    of the mean.

    Two operating modes, selected in ``initialize``:

    * one scan   -- a saved LSI model and dictionary are loaded from disk and
                    the single scan only gathers the distance statistics;
    * three scans -- scan 0 collects tokens, scan 1 fills a sparse
                    word-by-TU occurrence matrix, scan 2 gathers the distance
                    statistics on the SVD-reduced vectors.
    """

    def __init__(self):
        # Width of the acceptance band, in standard deviations.
        self.var_mult = 1.0

        self.src_language = ""
        self.trg_language = ""

        # Words seen fewer than min_count times are left out of the model.
        self.min_count = 3
        # Number of LSI topics, i.e. the dimensionality of the word vectors.
        self.num_of_features = 100
        self.thresh = 0.65

        # A flat token list during scan 0; afterwards a dict word -> row.
        self.all_words = []
        self.vocab = None
        self.vectors = None
        self.number_of_tus = 0

        self.model_file_name = "models/vectors_bg_model_50k"
        self.dict_file_name = "models/dict_50k"

        # Running sums used to derive mean / deviation of the distance.
        self.n = 0.0
        self.sum = 0.0
        self.sum_sq = 0.0

        self.mean = 0.0
        # NOTE: despite the name, this ends up holding the standard
        # deviation (see finalize).
        self.var = 0.0

        # True once self.vectors holds the reduced (LSI) word vectors.
        self._reduced = False

    def initialize(self, source_language, target_language):
        """Record the language pair and load a saved model if one exists.

        When ``self.model_file_name`` exists, the LSI projection and the
        word -> row dictionary are read from disk and ``num_of_scans`` drops
        to 1; otherwise three scans are requested.
        """
        self.num_of_scans = 3
        self.src_language = source_language
        self.trg_language = target_language
        self._reduced = False

        if not os.path.isfile(self.model_file_name):
            return

        print("Loading from file ...")
        self.num_of_scans = 1

        lsi = lsimodel.LsiModel.load(self.model_file_name)
        self.vectors = lsi.projection.u
        self._reduced = True

        self.all_words = {}
        # 'with' guarantees the handle is closed even if a line is malformed.
        with open(self.dict_file_name, "rb") as f:
            for line in f:
                fields = line.strip().split("\t")
                self.all_words[fields[0]] = int(fields[1])

    def finalize(self):
        """Finish the model and turn the running sums into mean / deviation."""
        if self.num_of_scans == 1:
            print("Loaded the model from file.")
        else:
            # Normally already done after the second scan; this is a
            # fallback in case that hook was not reached.
            self._reduce_vectors()

        # Guard against division by zero when fewer than two TUs were scored.
        if self.n <= 1:
            self.n = 2.0
        self.mean = self.sum / self.n
        variance = (self.sum_sq - (self.sum * self.sum) / self.n) / (self.n - 1)
        # Floating-point cancellation can push the variance slightly below
        # zero, which would make math.sqrt raise ValueError.
        self.var = math.sqrt(max(variance, 0.0))

    def process_tu(self, tu, num_of_finished_scans):
        """Consume one TU during the scan identified by num_of_finished_scans.

        * statistics scan (the only scan in one-scan mode, or the third
          scan): accumulate the TU's distance into the running sums;
        * scan 0 of three: collect the TU's tokens;
        * scan 1 of three: mark the TU's known words in the sparse matrix.
        """
        stats_scan = ((num_of_finished_scans == 0 and self.num_of_scans == 1)
                      or num_of_finished_scans == 2)

        if stats_scan:
            if len(tu.src_phrase) == 0 or len(tu.trg_phrase) == 0:
                return

            distance = self._tu_distance(tu)
            # None: one side has no known word.  NaN: a zero-norm vector;
            # adding it would turn every later statistic into NaN.
            if distance is None or math.isnan(distance):
                return

            self.n += 1
            self.sum += distance
            self.sum_sq += distance * distance

        elif num_of_finished_scans == 0:
            self.all_words += tu.src_tokens
            self.all_words += tu.trg_tokens
            self.number_of_tus += 1
        else:
            # number_of_tus was reset after scan 0 and now serves as the
            # column index of the current TU.
            for w in tu.src_tokens + tu.trg_tokens:
                if w in self.all_words:
                    self.vectors[self.all_words[w], self.number_of_tus] = 1

            self.number_of_tus += 1

    def do_after_a_full_scan(self, num_of_finished_scans):
        """Hook run after each full scan of the corpus.

        After scan 0 (three-scan mode) the vocabulary is built, the sparse
        occurrence matrix is allocated and the dictionary is written to
        ``self.dict_file_name``.  After scan 1 the matrix is reduced with
        SVD so that the statistics scan sees the same vectors as ``decide``.
        """
        if num_of_finished_scans == 1 and self.num_of_scans == 3:
            self.vocab = Counter(self.all_words)

            # Assign consecutive row indices to sufficiently frequent words.
            self.all_words = {}
            for word in self.vocab:
                if self.vocab[word] >= self.min_count:
                    self.all_words[word] = len(self.all_words)

            self.vectors = lil_matrix(
                (len(self.all_words), self.number_of_tus), dtype=np.int8)

            print("-#-#-#-#-#-#-#-#-#-#-#-")
            print("size of vocab: %d" % len(self.vocab))
            print("size of common words: %d" % len(self.all_words))
            print("number of TUs: %d" % self.number_of_tus)
            self.number_of_tus = 0

            with open(self.dict_file_name, "wb") as f:
                for word in self.all_words:
                    f.write(word + "\t" + str(self.all_words[word]) + "\n")
        else:
            print("-#-#-#-#-#-#-#-#-#-#-#-")
            if num_of_finished_scans == 2 and self.num_of_scans == 3:
                # The occurrence matrix is complete; reduce it now, before
                # the statistics scan indexes self.vectors.
                self._reduce_vectors()

    def decide(self, tu):
        """Return 'accept', 'reject' or 'neutral' for a TU.

        'reject'  -- a side is empty, or the distance is farther than
                     ``var_mult`` deviations from the corpus mean;
        'neutral' -- one side contains no known word;
        'accept'  -- otherwise.
        """
        if len(tu.src_phrase) == 0 or len(tu.trg_phrase) == 0:
            return 'reject'

        distance = self._tu_distance(tu)
        if distance is None:
            return 'neutral'

        deviation = math.fabs(distance - self.mean)
        if deviation <= self.var_mult * self.var:
            return 'accept'
        return 'reject'

    def _reduce_vectors(self):
        """Replace the sparse occurrence matrix by its LSI projection (once)."""
        if self._reduced:
            return

        print("Performing SVD...")
        corpus = Sparse2Corpus(self.vectors)
        lsi = lsimodel.LsiModel(corpus=corpus, id2word=None,
                                num_topics=self.num_of_features)
        lsi.save(self.model_file_name)
        self.vectors = lsi.projection.u
        self._reduced = True
        print("done.")

    def _phrase_vector(self, tokens):
        """Sum the vectors of the known tokens; None if no token is known."""
        rows = [self.vectors[self.all_words[w]]
                for w in tokens if w in self.all_words]
        if not rows:
            return None
        # Cosine distance ignores scale, so the sum is equivalent to the
        # average here.
        return np.sum(rows, axis=0)

    def _tu_distance(self, tu):
        """Cosine distance between both sides of tu; None if a side is unknown."""
        src_rep = self._phrase_vector(tu.src_tokens)
        if src_rep is None:
            return None
        trg_rep = self._phrase_vector(tu.trg_tokens)
        if trg_rep is None:
            return None
        return cosine(src_rep, trg_rep)

0 commit comments

Comments
 (0)