forked from AimeeLee77/senti_analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path4_getwordvecs.py
76 lines (62 loc) · 2.31 KB
/
4_getwordvecs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Extract text feature vectors from a word-vector model
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')# 忽略警告
import logging
import os.path
import codecs,sys
import numpy as np
import pandas as pd
import gensim
# Return the word vectors of the feature words
def getWordVecs(wordList, model):
    """Look up the embedding vector of every known word in ``wordList``.

    Args:
        wordList: Iterable of string tokens. Newline and carriage-return
            characters are removed from each token before the lookup.
        model: Object supporting ``model[word]`` that raises ``KeyError``
            for unknown words (this script passes gensim ``KeyedVectors``).

    Returns:
        A float numpy array with one row per token found in ``model``, in
        input order. Unknown tokens are skipped, so the array is empty when
        nothing matches.
    """
    vecs = []
    for word in wordList:
        # Remove '\r' as well as '\n': the caller reads the file in binary
        # mode (no newline translation), so with CRLF line endings the last
        # token of each line would otherwise keep a trailing '\r' and never
        # match the vocabulary.
        token = word.replace('\n', '').replace('\r', '')
        try:
            vecs.append(model[token])
        except KeyError:
            # Out-of-vocabulary token: contributes nothing to the result.
            continue
    return np.array(vecs, dtype='float')
# Build the document word vectors
def buildVecs(filename, model):
    """Build one mean word vector per line of a space-separated text file.

    Each line of ``filename`` (decoded as UTF-8) is split on single spaces,
    the resulting tokens are looked up with ``getWordVecs``, and the
    element-wise mean of the vectors found represents that line.

    Args:
        filename: Path of the text file to read.
        model: Word-vector model forwarded to ``getWordVecs``.

    Returns:
        A list of numpy vectors, one per line that contained at least one
        word known to ``model``. Lines with no known word are skipped, so
        the list may be shorter than the number of lines in the file.
    """
    # Obtain the logger here instead of relying on a module-level ``logger``
    # that only exists when the file is run as a script; otherwise calling
    # this function from an importing module raises NameError.
    log = logging.getLogger(__name__)
    fileVecs = []
    with codecs.open(filename, 'rb', encoding='utf-8') as contents:
        for line in contents:
            log.info("Start line: %s", line)
            vecs = getWordVecs(line.split(' '), model)
            # Only lines with at least one in-vocabulary word yield a vector.
            if len(vecs) > 0:
                fileVecs.append(vecs.mean(axis=0))
    return fileVecs
if __name__ == '__main__':
    # Logging setup. ``logger`` is kept as a module-level name because
    # buildVecs() looks it up as a global.
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s',
                        level=logging.INFO)
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logger.info("running %s" % ' '.join(sys.argv))

    # Load the word2vec model stored in text (non-binary) format.
    fdir = '/Users/sy/Desktop/pyRoot/SentimentAnalysis/'
    model = gensim.models.KeyedVectors.load_word2vec_format(
        fdir + 'wiki.zh.text.vector', binary=False)

    # One mean vector per usable line of each corpus file.
    posInput = buildVecs(fdir + '2000_pos_cut_stopword.txt', model)
    negInput = buildVecs(fdir + '2000_neg_cut_stopword.txt', model)

    # Labels: 1 for positive sentiment, 0 for negative; positives come first.
    labels = np.concatenate((np.ones(len(posInput)), np.zeros(len(negInput))))
    features = np.array(posInput + negInput)

    # Write the label column followed by the feature columns to CSV.
    data = pd.concat([pd.DataFrame(labels), pd.DataFrame(features)], axis=1)
    data.to_csv(fdir + '2000_data.csv')