Commit ddf32c9

Refactor
1 parent 0af6faa commit ddf32c9


70 files changed: +22176 −1924 lines

README.md

+51 −461 (large diff not rendered)

TextClassification/DataPreprocess.py

+151
@@ -0,0 +1,151 @@
import json
import pickle

import numpy as np
import jieba
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import word2vec

jieba.setLogLevel('WARN')


class DataPreprocess():
    def __init__(self):
        self.texts_cut = None
        self.tokenizer = None
        self.tokenizer_fact = None

    def cut_texts(self, texts=None, need_cut=True, word_len=1, savepath=None):
        '''
        Use jieba to cut texts.
        :param texts: list of texts
        :param need_cut: whether the texts still need to be cut
        :param word_len: minimum length of words to keep, used to drop stop-words
        :param savepath: path for saving the word lists as a JSON file
        :return: list of word lists
        '''
        if need_cut:
            if word_len > 1:
                texts_cut = [[word for word in jieba.lcut(text) if len(word) >= word_len] for text in texts]
            else:
                texts_cut = [jieba.lcut(one_text) for one_text in texts]
        else:
            if word_len > 1:
                texts_cut = [[word for word in text if len(word) >= word_len] for text in texts]
            else:
                texts_cut = texts

        if savepath is not None:
            with open(savepath, 'w') as f:
                json.dump(texts_cut, f)
        return texts_cut

    def text2seq(self, texts_cut=None, tokenizer=None, tokenizer_savepath=None,
                 num_words=2000, maxlen=30, batchsize=10000):
        '''
        Turn texts into integer sequences for a neural network's embedding layer.
        Converting a very large training set in one pass can exhaust memory,
        so samples are processed in batches.
        :param texts_cut: list of cut texts
        :param tokenizer: a fitted keras Tokenizer; a new one is fitted if None
        :param tokenizer_savepath: path for saving the tokenizer
        :param num_words: number of high-frequency words the tokenizer keeps
        :param maxlen: length sequences are padded/truncated to
        :param batchsize: number of documents processed per batch
        :return: list of padded integer sequences
        e.g. data_transform.text2seq(texts_cut=train_fact_cut, num_words=2000, maxlen=500)
        '''
        texts_cut_len = len(texts_cut)

        if tokenizer is None:
            tokenizer = Tokenizer(num_words=num_words)
            n = 0
            # fit the tokenizer batch by batch
            while n < texts_cut_len:
                tokenizer.fit_on_texts(texts=texts_cut[n:n + batchsize])
                n += batchsize
                if n < texts_cut_len:
                    print('tokenizer finish fit %d samples' % n)
                else:
                    print('tokenizer finish fit %d samples' % texts_cut_len)
        self.tokenizer = tokenizer

        if tokenizer_savepath:
            with open(tokenizer_savepath, mode='wb') as f:
                pickle.dump(tokenizer, f)

        # convert all texts to integer sequences
        fact_seq = tokenizer.texts_to_sequences(texts=texts_cut)
        print('finish texts to sequences')

        # free memory
        del texts_cut

        n = 0
        fact_pad_seq = []
        # run pad_sequences batch by batch
        while n < texts_cut_len:
            fact_pad_seq += list(pad_sequences(fact_seq[n:n + batchsize], maxlen=maxlen,
                                               padding='post', value=0, dtype='int'))
            n += batchsize
            if n < texts_cut_len:
                print('finish pad sequences %d/%d' % (n, texts_cut_len))
            else:
                print('finish pad sequences %d/%d' % (texts_cut_len, texts_cut_len))
        return fact_pad_seq

    def text2vec(self, texts_cut=None, model_word2vec=None,
                 word2vec_savepath=None, word2vec_loadpath=None,
                 sg=1, size=128, window=5, min_count=1):
        '''
        Turn word sequences into word-vector sequences, usable for either
        machine learning or deep learning.
        :param texts_cut: word sequences
        :param model_word2vec: a trained word2vec model; trained here if None
        :param word2vec_savepath: path for saving the word2vec model
        :param word2vec_loadpath: path for loading a word2vec model
        :param sg: 0 for CBOW, 1 for skip-gram
        :param size: the dimensionality of the feature vectors
        :param window: the maximum distance between the current and predicted word within a sentence
        :param min_count: ignore all words with total frequency lower than this
        :return: list of word-vector sequences
        '''
        if model_word2vec is None:
            if word2vec_loadpath:
                model_word2vec = word2vec.Word2Vec.load(word2vec_loadpath)
            else:
                model_word2vec = word2vec.Word2Vec(texts_cut, sg=sg, size=size, window=window, min_count=min_count)
        if word2vec_savepath:
            model_word2vec.save(word2vec_savepath)

        return [[model_word2vec[word] for word in text_cut if word in model_word2vec] for text_cut in texts_cut]

    def creat_label_set(self, labels):
        '''
        Collect the set of all labels, used for one-hot encoding.
        :param labels: original label lists
        :return: array of unique labels
        '''
        label_set = []
        for i in labels:
            label_set += i
        return np.array(list(set(label_set)))

    def creat_label(self, label, label_set):
        '''
        Build the one-hot encoding of a single label.
        :param label: original label
        :param label_set: set of all labels
        :return: one-hot array for the label
        e.g. creat_label(label=data_valid_accusations[12], label_set=accusations_set)
        '''
        label_zero = np.zeros(len(label_set))
        label_zero[np.in1d(label_set, label)] = 1
        return label_zero

    def creat_labels(self, labels=None, label_set=None):
        '''
        Call creat_label over a list of labels to build a 2-D one-hot array.
        :param labels: original labels
        :param label_set: set of all labels
        :return: list of one-hot arrays
        '''
        labels_one_hot = list(map(lambda x: self.creat_label(label=x, label_set=label_set), labels))
        return labels_one_hot
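
A minimal usage sketch of the preprocessing pipeline above, on illustrative toy data; it assumes gensim < 4 (for the `size=` argument used by `text2vec`), and none of the variable names below come from the module:

import numpy as np
from TextClassification.DataPreprocess import DataPreprocess

texts = ['我爱自然语言处理', '今天天气很好', '深度学习用于文本分类']
labels = [['tech'], ['weather'], ['tech']]

process = DataPreprocess()
texts_cut = process.cut_texts(texts=texts, need_cut=True, word_len=2)      # jieba word lists
seqs = process.text2seq(texts_cut=texts_cut, num_words=2000, maxlen=30)    # padded int sequences
vecs = process.text2vec(texts_cut=texts_cut, sg=1, size=128, min_count=1)  # word2vec vector lists
label_set = process.creat_label_set(labels)                                # unique label array
y = np.array(process.creat_labels(labels=labels, label_set=label_set))     # one-hot label matrix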
TextClassification/TextClassification.py

+105
@@ -0,0 +1,105 @@
import numpy as np

from .DataPreprocess import DataPreprocess
from .models import CNN, RNN


class TextClassification():
    def __init__(self):
        pass

    def fit(self, x=None, y=None, model=None,
            method='CNN', epochs=10, batchsize=256,
            x_need_preprocess=True, y_need_preprocess=True,
            tokenizer=None, num_words=2000, maxlen=30,
            vec_size=128, output_shape=None, output_type='multiple'):
        self.tokenizer = tokenizer
        self.num_words = num_words
        self.maxlen = maxlen
        self.vec_size = vec_size

        # preprocess x if needed
        if x_need_preprocess:
            process = DataPreprocess()
            # cut texts
            x_cut = process.cut_texts(texts=x, need_cut=True, word_len=2, savepath=None)
            # fall back to the average text length
            if maxlen is None:
                maxlen = int(np.array([len(i) for i in x_cut]).mean())
            # texts to sequences
            x_seq = process.text2seq(texts_cut=x_cut, tokenizer=tokenizer, tokenizer_savepath=None,
                                     num_words=num_words, maxlen=maxlen, batchsize=10000)
            x = np.array(x_seq)
            self.num_words = num_words
            self.maxlen = maxlen
            self.tokenizer = process.tokenizer

        # preprocess y if needed
        if y_need_preprocess:
            process = DataPreprocess()
            label_set = process.creat_label_set(y)
            labels = process.creat_labels(labels=y, label_set=label_set)
            labels = np.array(labels)
            output_shape = labels.shape[1]
            y = labels
            self.output_shape = output_shape
            self.label_set = label_set

        if model is None:
            if method == 'CNN':
                model = CNN(input_dim=num_words, input_length=maxlen,
                            vec_size=vec_size, output_shape=output_shape,
                            output_type=output_type)
            elif method == 'RNN':
                model = RNN(input_dim=num_words, input_length=maxlen,
                            vec_size=vec_size, output_shape=output_shape,
                            output_type=output_type)
            else:
                # maybe sklearn models later
                pass

        model.fit(x=x, y=y, epochs=epochs, batch_size=batchsize)
        self.model = model

    def predict(self, x=None, x_need_preprocess=True,
                tokenizer=None, num_words=None, maxlen=None):
        if x_need_preprocess:
            # fall back to the state saved during fit
            if tokenizer is None:
                tokenizer = self.tokenizer
            if num_words is None:
                num_words = self.num_words
            if maxlen is None:
                maxlen = self.maxlen
            process = DataPreprocess()
            x_cut = process.cut_texts(texts=x, need_cut=True, word_len=2, savepath=None)
            x_seq = process.text2seq(texts_cut=x_cut, tokenizer=tokenizer,
                                     num_words=num_words, maxlen=maxlen, batchsize=10000)
            x = np.array(x_seq)

        model = self.model
        y = model.predict(x=x)
        return y

    def label2toptag(self, predictions, labelset):
        # keep only the highest-scoring tag for each prediction
        labels = []
        for prediction in predictions:
            label = labelset[prediction == prediction.max()]
            labels.append(label.tolist())
        return labels

    def label2half(self, predictions, labelset):
        # keep every tag whose score exceeds 0.5
        labels = []
        for prediction in predictions:
            label = labelset[prediction > 0.5]
            labels.append(label.tolist())
        return labels

    def label2tag(self, predictions, labelset):
        # prefer the >0.5 tags; fall back to the top tag when none pass
        labels1 = self.label2toptag(predictions, labelset)
        labels2 = self.label2half(predictions, labelset)
        labels = []
        for i in range(len(predictions)):
            if len(labels2[i]) == 0:
                labels.append(labels1[i])
            else:
                labels.append(labels2[i])
        return labels
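
A minimal end-to-end sketch of the class above, on illustrative toy data; it assumes the package's `.models` module supplies the `CNN` builder referenced in `fit`:

from TextClassification import TextClassification

texts = ['这部电影非常好看', '股市今天大幅下跌', '球队昨晚赢得比赛', '新款手机正式发布']
labels = [['entertainment'], ['finance'], ['sports'], ['tech']]

clf = TextClassification()
clf.fit(x=texts, y=labels, method='CNN', epochs=5, batchsize=2,
        num_words=2000, maxlen=20, vec_size=128, output_type='multiple')

predictions = clf.predict(x=['比赛结果刚刚公布'])   # probability matrix, one row per text
print(clf.label2tag(predictions, clf.label_set))   # map probabilities back to tag names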

TextClassification/__init__.py

+2
@@ -0,0 +1,2 @@
from .DataPreprocess import DataPreprocess
from .TextClassification import TextClassification
