import json
import pickle

import numpy as np
import jieba
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import word2vec

jieba.setLogLevel('WARN')


class DataPreprocess():
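    '''
    Preprocessing helpers for Chinese text classification: jieba word
    segmentation, keras tokenizer integer sequences, gensim word2vec
    vectors and one-hot label encoding.
    '''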
    def __init__(self):
        self.texts_cut = None
        self.tokenizer = None
        self.tokenizer_fact = None

    def cut_texts(self, texts=None, need_cut=True, word_len=1, savepath=None):
        '''
        Segment texts into word lists with jieba.
        :param texts: list of raw texts (or word lists if already segmented)
        :param need_cut: whether the texts still need to be segmented
        :param word_len: minimum word length to keep, used to drop short stop-words
        :param savepath: path of a json file to save the word lists
        :return: list of word lists
        '''
        if need_cut:
            if word_len > 1:
                texts_cut = [[word for word in jieba.lcut(text) if len(word) >= word_len] for text in texts]
            else:
                texts_cut = [jieba.lcut(one_text) for one_text in texts]
        else:
            if word_len > 1:
                texts_cut = [[word for word in text if len(word) >= word_len] for text in texts]
            else:
                texts_cut = texts

        if savepath is not None:
            with open(savepath, 'w') as f:
                json.dump(texts_cut, f)
        return texts_cut

    def text2seq(self, texts_cut=None, tokenizer=None, tokenizer_savapah=None,
                 num_words=2000, maxlen=30, batchsize=10000):
        '''
        Convert segmented texts into integer sequences for a neural-network embedding layer.
        Fitting the whole training set at once can run out of memory, so the tokenizer is
        fitted in batches of `batchsize` samples.
        :param texts_cut: list of segmented texts (word lists)
        :param tokenizer: keras Tokenizer used for the conversion
        :param tokenizer_savapah: path to save the fitted tokenizer (pickle)
        :param num_words: number of most frequent words kept in the vocabulary
        :param maxlen: length to pad or truncate each sequence to
        :param batchsize: number of documents processed per batch
        :return: list of padded integer sequences
        eg. data_transform.text2seq(texts_cut=train_fact_cut, num_words=2000, maxlen=500)
        '''
        texts_cut_len = len(texts_cut)

        if tokenizer is None:
            tokenizer = Tokenizer(num_words=num_words)
            n = 0
            # fit the tokenizer in batches to limit memory usage
            while n < texts_cut_len:
                tokenizer.fit_on_texts(texts=texts_cut[n:n + batchsize])
                n += batchsize
                if n < texts_cut_len:
                    print('tokenizer finish fit %d samples' % n)
                else:
                    print('tokenizer finish fit %d samples' % texts_cut_len)
            self.tokenizer = tokenizer

        if tokenizer_savapah:
            with open(tokenizer_savapah, mode='wb') as f:
                pickle.dump(tokenizer, f)

        # convert all texts to integer sequences
        fact_seq = tokenizer.texts_to_sequences(texts=texts_cut)
        print('finish texts to sequences')

        # drop the local reference to the word lists to save memory
        del texts_cut

        n = 0
        fact_pad_seq = []
        # pad the sequences in batches
        while n < texts_cut_len:
            fact_pad_seq += list(pad_sequences(fact_seq[n:n + batchsize], maxlen=maxlen,
                                               padding='post', value=0, dtype='int'))
            n += batchsize
            if n < texts_cut_len:
                print('finish pad sequences %d/%d' % (n, texts_cut_len))
            else:
                print('finish pad sequences %d/%d' % (texts_cut_len, texts_cut_len))
        return fact_pad_seq

    def text2vec(self, texts_cut=None, model_word2vec=None,
                 word2vec_savepath=None, word2vec_loadpath=None,
                 sg=1, size=128, window=5, min_count=1):
        '''
        Convert word lists into sequences of word vectors, usable for machine
        learning or deep learning models.
        :param texts_cut: list of segmented texts (word lists)
        :param model_word2vec: a trained word2vec model
        :param word2vec_savepath: path to save the word2vec model
        :param word2vec_loadpath: path to load a word2vec model from
        :param sg: 0 for CBOW, 1 for skip-gram
        :param size: the dimensionality of the feature vectors
        :param window: the maximum distance between the current and predicted word within a sentence
        :param min_count: ignore all words with total frequency lower than this
        :return: list of word-vector sequences
        '''
        if model_word2vec is None:
            if word2vec_loadpath:
                model_word2vec = word2vec.Word2Vec.load(word2vec_loadpath)
            else:
                model_word2vec = word2vec.Word2Vec(texts_cut, sg=sg, size=size, window=window, min_count=min_count)
            if word2vec_savepath:
                model_word2vec.save(word2vec_savepath)

        return [[model_word2vec.wv[word] for word in text_cut if word in model_word2vec.wv]
                for text_cut in texts_cut]

    def creat_label_set(self, labels):
        '''
        Collect the full label set, used for one-hot encoding.
        :param labels: original labels (a list of label lists)
        :return: array of unique labels
        '''
        label_set = []
        for i in labels:
            label_set += i
        return np.array(list(set(label_set)))

    def creat_label(self, label, label_set):
        '''
        Build the one-hot encoding of a single sample's labels.
        :param label: original label(s) of one sample
        :param label_set: the full label set
        :return: one-hot label array
        eg. creat_label(label=data_valid_accusations[12], label_set=accusations_set)
        '''
        label_zero = np.zeros(len(label_set))
        label_zero[np.in1d(label_set, label)] = 1
        return label_zero

    def creat_labels(self, labels=None, label_set=None):
        '''
        Apply creat_label to every element of a label list to build a 2-D one-hot array.
        :param labels: original labels (a list of label lists)
        :param label_set: the full label set
        :return: list of one-hot label arrays, one per sample
        '''
        labels_one_hot = list(map(lambda x: self.creat_label(label=x, label_set=label_set), labels))
        return labels_one_hot
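

if __name__ == '__main__':
    # Minimal usage sketch with made-up sample texts and labels, assuming working
    # jieba / keras / gensim installs; illustrative only, not part of the original module.
    texts = ['今天天气很好', '明天有大雨', '后天天气很好']
    labels = [['晴'], ['雨'], ['晴']]

    process = DataPreprocess()

    # 1. segment the raw texts into word lists
    texts_cut = process.cut_texts(texts=texts, need_cut=True, word_len=1)

    # 2. turn the word lists into padded integer sequences for an embedding layer
    seq = process.text2seq(texts_cut=texts_cut, num_words=2000, maxlen=20, batchsize=10000)

    # 3. build the label set and one-hot encode the labels
    label_set = process.creat_label_set(labels)
    labels_one_hot = process.creat_labels(labels=labels, label_set=label_set)

    print(np.array(seq).shape)              # (3, 20)
    print(np.array(labels_one_hot).shape)   # (3, 2)

    # 4. (optional) word vectors instead of tokenizer indices; the `size` argument
    # assumes gensim < 4.0 (it was renamed vector_size in gensim 4).
    # vectors = process.text2vec(texts_cut=texts_cut, sg=1, size=128, window=5, min_count=1)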