|
22 | 22 | },
|
23 | 23 | {
|
24 | 24 | "cell_type": "code",
|
25 |
| - "execution_count": 1, |
| 25 | + "execution_count": 23, |
26 | 26 | "metadata": {
|
27 | 27 | "collapsed": false
|
28 | 28 | },
|
|
46 | 46 | "name": "stderr",
|
47 | 47 | "output_type": "stream",
|
48 | 48 | "text": [
|
49 |
| - "Loading model cost 1.042 seconds.\n", |
| 49 | + "Loading model cost 1.214 seconds.\n", |
50 | 50 | "Prefix dict has been built succesfully.\n"
|
51 | 51 | ]
|
52 | 52 | },
|
|
63 | 63 | {
|
64 | 64 | "data": {
|
65 | 65 | "text/plain": [
|
66 |
| - "{'doc_title': ['南', '天', '信息', '管理层', '增持', '86', '万股'], 'doc_type': 'IT'}" |
| 66 | + "{'doc_content': ['本期',\n", |
| 67 | + " '节目',\n", |
| 68 | + " '内容',\n", |
| 69 | + " '介绍',\n", |
| 70 | + " '关注',\n", |
| 71 | + " '机动车',\n", |
| 72 | + " '驾驶证',\n", |
| 73 | + " '申领',\n", |
| 74 | + " '和',\n", |
| 75 | + " '使用',\n", |
| 76 | + " '规定',\n", |
| 77 | + " '搜狐',\n", |
| 78 | + " '汽车',\n", |
| 79 | + " '广播',\n", |
| 80 | + " '诚邀',\n", |
| 81 | + " '全国',\n", |
| 82 | + " '各地',\n", |
| 83 | + " '强势',\n", |
| 84 | + " '电台',\n", |
| 85 | + " '真情',\n", |
| 86 | + " '加盟',\n", |
| 87 | + " '携手',\n", |
| 88 | + " '打造',\n", |
| 89 | + " '中国',\n", |
| 90 | + " '汽车',\n", |
| 91 | + " '广播',\n", |
| 92 | + " '最强',\n", |
| 93 | + " '容',\n", |
| 94 | + " '把脉',\n", |
| 95 | + " '全球',\n", |
| 96 | + " '汽车产业',\n", |
| 97 | + " '风向标',\n", |
| 98 | + " '引领',\n", |
| 99 | + " '时尚',\n", |
| 100 | + " '汽车',\n", |
| 101 | + " '消费',\n", |
| 102 | + " '的',\n", |
| 103 | + " '参考书',\n", |
| 104 | + " '搜狐',\n", |
| 105 | + " '汽车',\n", |
| 106 | + " '广播',\n", |
| 107 | + " '车旅',\n", |
| 108 | + " '杂志',\n", |
| 109 | + " '服务',\n", |
| 110 | + " '我们',\n", |
| 111 | + " '的',\n", |
| 112 | + " '汽车',\n", |
| 113 | + " '生活',\n", |
| 114 | + " '加盟',\n", |
| 115 | + " '热线',\n", |
| 116 | + " '13381202220',\n", |
| 117 | + " '010',\n", |
| 118 | + " '62729907',\n", |
| 119 | + " '独家',\n", |
| 120 | + " '出品',\n", |
| 121 | + " '搜狐',\n", |
| 122 | + " '汽车',\n", |
| 123 | + " '事业部'],\n", |
| 124 | + " 'doc_title': ['搜狐', '汽车', '广播', '车旅', '杂志', '2012', '06', '20', '期'],\n", |
| 125 | + " 'doc_type': '汽车'}" |
67 | 126 | ]
|
68 | 127 | },
|
69 |
| - "execution_count": 1, |
| 128 | + "execution_count": 23, |
70 | 129 | "metadata": {},
|
71 | 130 | "output_type": "execute_result"
|
72 | 131 | }
|
|
97 | 156 | " return tokens\n",
|
98 | 157 | "\n",
|
99 | 158 | "# 对新闻标题进行分词,得到带分词的新闻数据\n",
|
100 |
| - "tokenSougouNews = featurelize(sougouNews, fields=['doc_title'], analyzer=Analyzer())\n", |
| 159 | + "tokenSougouNews = featurelize(sougouNews, fields=['doc_title', 'doc_content'], analyzer=Analyzer())\n", |
101 | 160 | "print('完成对新闻标题的分词')\n",
|
102 | 161 | "\n",
|
103 | 162 | "# 将分词后的结果dump到本地\n",
|
|
159 | 218 | "source": [
|
160 | 219 | "import pickle\n",
|
161 | 220 | "\n",
|
| 221 | + "\n", |
| 222 | + "with open('tokenSougouNews-test.pk', 'rb') as f:\n", |
| 223 | + "    testData = pickle.load(f)\n", |
162 | 224 | "with open('tokenSougouNews-train.pk', 'rb') as f:\n",
|
163 | 225 | " trainData = pickle.load(f)\n",
|
164 | 226 | "trainX = [dict(doc_title=' '.join(d['doc_title'])) for d in trainData]\n",
|
165 | 227 | "trainY = [d['doc_type'] for d in trainData]\n",
|
166 |
| - "print('train size=%d' % (len(trainX)))\n", |
167 |
| - " \n", |
168 |
| - "with open('tokenSougouNews-test.pk', 'rb') as f:\n", |
169 |
| - " testData = pickle.load(f)\n", |
| 228 | + "print('train size=%d' % (len(trainX))).load(f)\n", |
170 | 229 | "testX = [dict(doc_title=' '.join(d['doc_title'])) for d in testData]\n",
|
171 | 230 | "testY = [d['doc_type'] for d in testData]\n",
|
172 | 231 | "print('test size=%d' % (len(testX)))"
|
|
227 | 286 | },
|
228 | 287 | {
|
229 | 288 | "cell_type": "code",
|
230 |
| - "execution_count": 11, |
| 289 | + "execution_count": 16, |
231 | 290 | "metadata": {
|
232 | 291 | "collapsed": false
|
233 | 292 | },
|
|
244 | 303 | "name": "stdout",
|
245 | 304 | "output_type": "stream",
|
246 | 305 | "text": [
|
| 306 | + "tfidf+lr: trainAcc=0.913848, testAcc=0.869774\n", |
247 | 307 | "tfidf+multiNB: trainAcc=0.867886, testAcc=0.821235\n",
|
248 | 308 | "tfidf+svm: trainAcc=0.981018, testAcc=0.895906\n"
|
249 | 309 | ]
|
|
252 | 312 | "source": [
|
253 | 313 | "from sklearn.naive_bayes import MultinomialNB, BernoulliNB\n",
|
254 | 314 | "from sklearn.svm import LinearSVC\n",
|
| 315 | + "from sklearn.linear_model import LogisticRegression\n", |
255 | 316 | "from sklearn.pipeline import Pipeline\n",
|
256 | 317 | "from sklearn.metrics import accuracy_score\n",
|
257 | 318 | "\n",
|
| 319 | + "# tfidf + lr\n", |
| 320 | + "lrClf = Pipeline([('tfidfVectorizor', TfidfVectorizor(['doc_title'])),\n", |
| 321 | + " ('lr', LogisticRegression())])\n", |
| 322 | + "lrClf.fit(trainX, trainY)\n", |
| 323 | + "\n", |
| 324 | + "trainAcc = accuracy_score(trainY, lrClf.predict(trainX))\n", |
| 325 | + "testAcc = accuracy_score(testY, lrClf.predict(testX))\n", |
| 326 | + "print('tfidf+lr: trainAcc=%f, testAcc=%f' % (trainAcc, testAcc))\n", |
| 327 | + "\n", |
| 328 | + "# tfidf + nb\n", |
258 | 329 | "nbClf = Pipeline([('tfidfVectorizor', TfidfVectorizor(['doc_title'])),\n",
|
259 | 330 | " ('multinomialNB', MultinomialNB())])\n",
|
260 | 331 | "nbClf.fit(trainX, trainY)\n",
|
261 | 332 | "\n",
|
262 |
| - "# 计算误差\n", |
263 | 333 | "trainAcc = accuracy_score(trainY, nbClf.predict(trainX))\n",
|
264 | 334 | "testAcc = accuracy_score(testY, nbClf.predict(testX))\n",
|
265 | 335 | "print('tfidf+multiNB: trainAcc=%f, testAcc=%f' % (trainAcc, testAcc))\n",
|
266 | 336 | "\n",
|
| 337 | + "# tfidf + svm\n", |
267 | 338 | "svmClf = Pipeline([('tfidfVectorizor', TfidfVectorizor(['doc_title'])),\n",
|
268 | 339 | " ('svm', LinearSVC())])\n",
|
269 | 340 | "svmClf.fit(trainX, trainY)\n",
|
270 | 341 | "\n",
|
271 |
| - "# 计算误差\n", |
272 | 342 | "trainAcc = accuracy_score(trainY, svmClf.predict(trainX))\n",
|
273 | 343 | "testAcc = accuracy_score(testY, svmClf.predict(testX))\n",
|
274 | 344 | "print('tfidf+svm: trainAcc=%f, testAcc=%f' % (trainAcc, testAcc))"
|
|
285 | 355 | },
|
286 | 356 | {
|
287 | 357 | "cell_type": "code",
|
288 |
| - "execution_count": 12, |
| 358 | + "execution_count": 17, |
289 | 359 | "metadata": {
|
290 | 360 | "collapsed": false
|
291 | 361 | },
|
|
300 | 370 | {
|
301 | 371 | "data": {
|
302 | 372 | "text/plain": [
|
303 |
| - "<__main__.Doc2VecVectorizor at 0x1fd4fd75710>" |
| 373 | + "<__main__.Doc2VecVectorizor at 0x1fd5007cf98>" |
304 | 374 | ]
|
305 | 375 | },
|
306 |
| - "execution_count": 12, |
| 376 | + "execution_count": 17, |
307 | 377 | "metadata": {},
|
308 | 378 | "output_type": "execute_result"
|
309 | 379 | }
|
|
312 | 382 | "from gensim.models import Word2Vec\n",
|
313 | 383 | "\n",
|
314 | 384 | "class Doc2VecVectorizor(object):\n",
|
315 |
| - " def __init__(self, fields, size=200, window=3, min_count=1):\n", |
| 385 | + " def __init__(self, fields, size=100, window=3, min_count=1):\n", |
316 | 386 | " self.fields = fields\n",
|
317 | 387 | " self.size = size\n",
|
318 | 388 | " self.window = window\n",
|
|
352 | 422 | },
|
353 | 423 | {
|
354 | 424 | "cell_type": "code",
|
355 |
| - "execution_count": 13, |
| 425 | + "execution_count": 18, |
356 | 426 | "metadata": {
|
357 | 427 | "collapsed": false
|
358 | 428 | },
|
359 | 429 | "outputs": [
|
360 | 430 | {
|
361 | 431 | "data": {
|
362 | 432 | "text/plain": [
|
363 |
| - "[('老年人', 0.974733293056488),\n", |
364 |
| - " ('日内瓦', 0.9729659557342529),\n", |
365 |
| - " ('国际足球', 0.9727454781532288),\n", |
366 |
| - " ('专访', 0.9721158146858215),\n", |
367 |
| - " ('搜狐', 0.9709295034408569),\n", |
368 |
| - " ('第九届', 0.9708148241043091),\n", |
369 |
| - " ('舞蹈节', 0.9674550294876099),\n", |
370 |
| - " ('文化周', 0.9654016494750977),\n", |
371 |
| - " ('日程安排', 0.9652378559112549),\n", |
372 |
| - " ('作文题', 0.9637157320976257)]" |
| 433 | + "[('舞蹈节', 0.9734185934066772),\n", |
| 434 | + " ('专访', 0.9699808955192566),\n", |
| 435 | + " ('老年人', 0.9686485528945923),\n", |
| 436 | + " ('日内瓦', 0.9671200513839722),\n", |
| 437 | + " ('搜狐', 0.9666953086853027),\n", |
| 438 | + " ('看车', 0.963032603263855),\n", |
| 439 | + " ('国际足球', 0.9596318006515503),\n", |
| 440 | + " ('广汽传祺', 0.9582968950271606),\n", |
| 441 | + " ('篮联', 0.9582201242446899),\n", |
| 442 | + " ('海河', 0.9577779173851013)]" |
373 | 443 | ]
|
374 | 444 | },
|
375 |
| - "execution_count": 13, |
| 445 | + "execution_count": 18, |
376 | 446 | "metadata": {},
|
377 | 447 | "output_type": "execute_result"
|
378 | 448 | }
|
|
381 | 451 | "doc2vec.word2vec.wv.similar_by_word(word='体育', topn=10)"
|
382 | 452 | ]
|
383 | 453 | },
|
| 454 | + { |
| 455 | + "cell_type": "code", |
| 456 | + "execution_count": 20, |
| 457 | + "metadata": { |
| 458 | + "collapsed": false |
| 459 | + }, |
| 460 | + "outputs": [ |
| 461 | + { |
| 462 | + "data": { |
| 463 | + "text/plain": [ |
| 464 | + "100" |
| 465 | + ] |
| 466 | + }, |
| 467 | + "execution_count": 20, |
| 468 | + "metadata": {}, |
| 469 | + "output_type": "execute_result" |
| 470 | + } |
| 471 | + ], |
| 472 | + "source": [ |
| 473 | + "doc2vec.word2vec.vector_size" |
| 474 | + ] |
| 475 | + }, |
384 | 476 | {
|
385 | 477 | "cell_type": "markdown",
|
386 | 478 | "metadata": {},
|
|
390 | 482 | },
|
391 | 483 | {
|
392 | 484 | "cell_type": "code",
|
393 |
| - "execution_count": 15, |
| 485 | + "execution_count": 19, |
394 | 486 | "metadata": {
|
395 | 487 | "collapsed": false
|
396 | 488 | },
|
|
408 | 500 | "name": "stdout",
|
409 | 501 | "output_type": "stream",
|
410 | 502 | "text": [
|
411 |
| - "doc2vec+svm: trainAcc=0.706894, testAcc=0.709253\n" |
| 503 | + "doc2vec+svm: trainAcc=0.705841, testAcc=0.708672\n" |
412 | 504 | ]
|
413 | 505 | }
|
414 | 506 | ],
|
|
424 | 516 | "testAcc = accuracy_score(testY, svmClf.predict(testX))\n",
|
425 | 517 | "print('doc2vec+svm: trainAcc=%f, testAcc=%f' % (trainAcc, testAcc))"
|
426 | 518 | ]
|
| 519 | + }, |
| 520 | + { |
| 521 | + "cell_type": "markdown", |
| 522 | + "metadata": {}, |
| 523 | + "source": [ |
| 524 | + "### tf-idf加权的word2vec + classification\n", |
| 525 | + "#### tf-idf加权的word2vec" |
| 526 | + ] |
| 527 | + }, |
| 528 | + { |
| 529 | + "cell_type": "code", |
| 530 | + "execution_count": null, |
| 531 | + "metadata": { |
| 532 | + "collapsed": true |
| 533 | + }, |
| 534 | + "outputs": [], |
| 535 | + "source": [ |
| 536 | + "from gensim.models import Word2Vec\n", |
| 537 | + "\n", |
| 538 | + "class Doc2VecVectorizor(object):\n", |
| 539 | + " def __init__(self, tfidfVectorizor, word2vecVectorizor, fields):\n", |
| 540 | + " self.tfidfVectorizor = tfidfVectorizor\n", |
| 541 | + " self.word2vecVectorizor = word2vecVectorizor\n", |
| 542 | + " self.fields = fields\n", |
| 543 | + " \n", |
| 544 | + " def fit(self, X, y=None):\n", |
| 545 | + " return self\n", |
| 546 | + " \n", |
| 547 | + " def transform(self, X):\n", |
| 548 | + " \"\"\"\n", |
| 549 | + " 计算文档的特征向量\n", |
| 550 | + " 1. 对每个属性,计算每个词的tfidf-vector和word-vector,然后将所有词的两个vector的加权平均向量作为该属性的vector\n", |
| 551 | + " 2. 所有属性的vector,flatten为一个宽vector,作为该文档的特征向量\n", |
| 552 | + " \"\"\"\n", |
| 553 | + " return np.array([self.__doc2vec(x) for x in X])\n", |
| 554 | + " \n", |
| 555 | + " def __sentence2vec(self, sentence):\n", |
| 556 | + " if len(sentence.strip()) == 0:\n", |
| 557 | + " return np.zeros(self.size)\n", |
| 558 | + " vectors = [self.word2vecVectorizor[word]*self.tfidfVectorizor.transform() \n", |
| 559 | + " if word in self.word2vecVectorizor else np.zeros(self.size) \n", |
| 560 | + " for word in sentence.split()]\n", |
| 561 | + " return np.mean(vectors, axis=0)\n", |
| 562 | + " \n", |
| 563 | + " def __doc2vec(self, doc):\n", |
| 564 | + " vectors = np.array([self.__sentence2vec(doc[field]) for field in self.fields])\n", |
| 565 | + " return vectors.flatten()\n", |
| 566 | + " \n", |
| 567 | + "doc2vec = Doc2VecVectorizor(fields=['doc_title'])\n", |
| 568 | + "doc2vec.fit(trainX)" |
| 569 | + ] |
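| 587 | + }, |
| 588 | + { |
| 589 | + "cell_type": "code", |
| 590 | + "execution_count": null, |
| 591 | + "metadata": { |
| 592 | + "collapsed": true |
| 593 | + }, |
| 594 | + "outputs": [], |
| 595 | + "source": [ |
| 596 | + "# 示意(未运行的草稿):仿照上文doc2vec+svm的写法,用tf-idf加权的文档向量训练分类器;\n", |
| 597 | + "# 假设上一cell的doc2vec已fit,Pipeline/LinearSVC/accuracy_score沿用前文的import\n", |
| 598 | + "svmClf = Pipeline([('doc2vecVectorizor', doc2vec),\n", |
| 599 | + "                   ('svm', LinearSVC())])\n", |
| 600 | + "svmClf.fit(trainX, trainY)\n", |
| 601 | + "\n", |
| 602 | + "trainAcc = accuracy_score(trainY, svmClf.predict(trainX))\n", |
| 603 | + "testAcc = accuracy_score(testY, svmClf.predict(testX))\n", |
| 604 | + "print('tfidf-weighted doc2vec+svm: trainAcc=%f, testAcc=%f' % (trainAcc, testAcc))" |
| 605 | + ] |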
427 | 570 | }
|
428 | 571 | ],
|
429 | 572 | "metadata": {
|
|