-
Notifications
You must be signed in to change notification settings - Fork 1
/
bow.py
73 lines (60 loc) · 2.21 KB
/
bow.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
''' BoW encoder '''
import io
import logging as log
import numpy as np
import pdb
def get_word_dict(sentences, tokenize=True):
    ''' Lowercase and tokenize sentences and collect their vocabulary.

    When tokenize is true the sentences are split with NLTK's
    word_tokenize (imported lazily, so NLTK is only needed in that case);
    otherwise they are split on whitespace with str.split.

    Returns a pair (tokenized_sents, word_dict): the list of token lists,
    one per input sentence, and a dict mapping every distinct token to ''.
    '''
    if tokenize:
        from nltk.tokenize import word_tokenize as split_words
    else:
        split_words = str.split

    tokenized_sents = []
    word_dict = {}
    for sentence in sentences:
        tokens = split_words(sentence.lower())
        tokenized_sents.append(tokens)
        for token in tokens:
            # Only the keys matter; the value is a placeholder.
            word_dict.setdefault(token, '')
    return tokenized_sents, word_dict
def get_vecs(sentences, word_vec, dim):
    ''' Create BoW representations for sentences using word_vecs

    sentences: iterable of token lists.
    word_vec: mapping from token to its vector (anything np.array accepts).
    dim: length of the word vectors, used to size the accumulator.

    Returns a dict keyed by the space-joined tokens of each sentence that
    were found in word_vec. The value is the sum of those tokens' vectors
    divided by the total number of tokens in the sentence (tokens missing
    from word_vec still count towards the divisor). Sentences that share
    the same sequence of known tokens map to the same key, so the later
    one overwrites the earlier one.

    A sentence with no tokens yields a zero vector rather than the NaN
    vector (and RuntimeWarning) that a 0/0 division would produce.
    '''
    bow_vec = {}
    for sent in sentences:
        known = [word for word in sent if word in word_vec]
        vec = np.zeros(dim)  # initialize w/ zeros
        for word in known:
            vec += np.array(word_vec[word])
        if sent:
            # Guard against dividing by zero for an empty token list.
            vec = vec / len(sent)
        bow_vec[' '.join(known)] = vec
    return bow_vec
def get_glove(vocab, glove_path):
    ''' Load vectors for words in word_dict from glove_path

    vocab: container of tokens to keep (only membership is tested).
    glove_path: path to a text file with one "<word> <v1> <v2> ..." entry
        per line, separated by single spaces.

    Returns (word_vecs, dim): a dict from each vocab word found in the
    file to its numpy vector, and the vector length. dim is None when no
    vocab word was found.

    Raises AssertionError if the matched vectors differ in length.
    '''
    word_vecs = {}
    dim = None
    # Decode explicitly as UTF-8 instead of the platform default so
    # non-ASCII tokens load the same way on every system.
    with io.open(glove_path, encoding='utf-8') as f:
        for line in f:
            word, sep, vec = line.partition(' ')
            if not sep:
                # Blank or separator-less line (e.g. a trailing empty
                # line): nothing to parse, skip it instead of crashing.
                continue
            if word not in vocab:
                continue
            # log.info('word found: %s', word)
            word_vecs[word] = np.fromstring(vec, sep=' ')
            if dim is None:
                # The first matched vector fixes the expected length.
                dim = len(word_vecs[word])
            else:
                assert len(word_vecs[word]) == dim
    log.info('Found word vectors')
    log.info(word_vecs.keys())
    log.info('Found %d/%d words with glove vectors', len(word_vecs), len(vocab))
    return word_vecs, dim
def encode(sentences, glove_path, tokenize=True):
    ''' Encode sentences into BoW representation

    Tokenizes the sentences with get_word_dict, loads the vectors for the
    resulting vocabulary from glove_path with get_glove, and combines them
    per sentence with get_vecs. Logs the number of words with vectors, the
    number of input sentences and the number of encoded entries.

    Returns the dict produced by get_vecs.
    '''
    tokenized, vocabulary = get_word_dict(sentences, tokenize=tokenize)
    glove_vecs, vec_dim = get_glove(vocabulary, glove_path)
    encoded = get_vecs(tokenized, glove_vecs, vec_dim)

    log.info('Vocab size : %d', len(glove_vecs))
    log.info('Origin No of Sentences: %d', len(sentences))
    log.info('Encoded No of Sentences : %d', len(encoded))
    return encoded