forked from nlx-group/LX-DSemVectors
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tools.py
executable file
·64 lines (48 loc) · 1.91 KB
/
tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/python
# -*- coding: utf-8 -*-
import gensim
import logging
import numpy as np
if __name__ == "__main__":
    # Show gensim's progress/status messages while the model loads.
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # Load the pre-trained "vanilla" Portuguese Wikipedia word2vec model.
    # NOTE(review): this script uses the pre-4.0 gensim API (model.syn0,
    # model.index2word, model[...], init_sims); gensim >= 4 moved these to
    # model.wv — confirm the installed gensim version before running.
    vanilla_model = "./models/vanilla/wikipedia.vanilla.w2v"
    model = gensim.models.Word2Vec.load(vanilla_model)
    # L2-normalise the vectors in place: saves memory, but the model
    # becomes read-only (no further training possible).
    model.init_sims(replace=True)

    # Algebraic operations with words: rei - homem + mulher
    # ("king" - "man" + "woman" — the classic analogy, in Portuguese).
    print(model.most_similar(positive=["mulher", "rei"],
                             negative=["homem"]))
    # Same analogy using the multiplicative combination (3CosMul).
    print(model.most_similar_cosmul(positive=["mulher", "rei"],
                                    negative=["homem"]))
    # Pick the out-of-context word from a list.
    print(model.doesnt_match("Portugal Espanha Alemanha Pacífico".split()))
    # Cosine similarity between two words.
    print(model.similarity("homem", "mulher"))
    # Cosine similarity between two sets of words.
    print(model.n_similarity(["Portugal", "português"],
                             ["lisboa", "bacalhau"]))
    # Print the raw embedding vector of a single word.
    print(model["palavra"])
    # Model size: (vocabulary size, vector dimensionality).
    print(model.syn0.shape)
    # Convert the vocabulary to a set for O(1) membership tests.
    vocab = set(model.index2word)
    # Word in vocabulary?
    print("vetusto" in vocab)
    print("australopitecos" in vocab)
    # Membership can also be tested directly on the model's vocab mapping.
    print("vocabulário" in model.vocab)
def find_nearest(model, vector, K=5):
    """Return the K vocabulary words whose normalised vectors are closest
    to *vector* by squared Euclidean distance (nearest first).

    Parameters
    ----------
    model : object exposing `syn0norm` (2-D array, one row per word) and
        `index2word` (list mapping row index -> word).
    vector : 1-D array with the same dimensionality as the model rows.
    K : int, number of neighbours to return (default 5).
    """
    # Squared Euclidean distance from every vocabulary vector to `vector`.
    distances = np.sum(np.square(model.syn0norm - vector), axis=1)
    # Indices of the K smallest distances, in increasing order.
    nearest_idx = distances.argsort()[:K]
    # Bug fix: the original returned a lazy `map` object, which under
    # Python 3 prints as "<map object ...>" — return a concrete list.
    return [model.index2word[i] for i in nearest_idx]
# Reverse lookup: find the words nearest to an arbitrary vector, using
# the helper defined above.
nearest_words = find_nearest(model, model["palavra"])
print(nearest_words)
# ... or directly through gensim, which accepts raw vectors in `positive`.
print(model.most_similar(positive=[model["palavra"]], topn=5))
# Analogy intended to surface antonyms: bom - mau + triste
# ("good" - "bad" + "sad").
antonym_candidates = model.most_similar(positive=['bom', 'triste'],
                                        negative=['mau'])
print(antonym_candidates)
# Export the model to the plain word2vec format (disabled by default).
# model.save_word2vec_format("word2vec.format")