forked from danan0755/poetry
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpoetspider.py
143 lines (108 loc) · 4.12 KB
/
poetspider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#coding=utf-8
import urllib2
import bs4
from bs4 import BeautifulSoup
import time
# Start of the run; the summary printed at the end of the script subtracts
# this value from a second time.clock() reading.
starttime = time.clock()

# Download the Tang-dynasty index page, which links to each poet's page.
site = "http://sou-yun.com/PoemIndex.aspx?dynasty=Tang"
requests = urllib2.Request(site)
requests.add_header('User-Agent', "HeJingjing's Browser")
response = urllib2.urlopen(requests)
try:
    site_page = response.read().decode('utf-8')
finally:
    # Release the connection even if reading or decoding fails.
    response.close()
soup = BeautifulSoup(site_page, 'html.parser')

# Collect the site-relative URL of every poet linked from the index.
poemlist = []
tags = soup('a')
for tag in tags:
    address = tag.get('href')
    # Anchors without an href yield None, which cannot be searched with `in`.
    if address and '/PoemIndex.aspx?dynasty=Tang&author' in address:
        poemlist.append(address)

# Drop duplicate links; sorting keeps the crawl order the same between runs.
poemlists = sorted(set(poemlist))
def view_poet(url):
    """Fetch one page of sou-yun.com and return it parsed by BeautifulSoup.

    url -- site-relative path and query string; it is appended to
           'http://sou-yun.com' to form the address that is requested.

    Returns the BeautifulSoup tree built with the 'html.parser' backend.
    Exceptions raised by urllib2 while opening or reading the page are not
    caught here and propagate to the caller.
    """
    url = 'http://sou-yun.com' + url
    req = urllib2.Request(url)
    req.add_header('User-Agent', "HeJingjing's Browser")
    response = urllib2.urlopen(req)
    try:
        html = response.read()
    finally:
        # This function is called once per result page, so close each
        # response promptly instead of leaving it to garbage collection.
        response.close()
    # Setting an `encoding` attribute on the urllib2 response has no effect;
    # the intended UTF-8 decoding is requested from the parser instead.
    return BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
def _write_poems(page_soup, out):
    """Write every poem paragraph of one result page to out; return the count.

    A <p> element is treated as poem text when its markup contains the
    '<!-- -->' marker. Its text is split on whitespace and each fragment is
    written UTF-8 encoded, followed by a blank line.

    NOTE(review): the indentation of the original script was lost in this
    copy; counting once per <p> (rather than once per fragment) follows the
    older version of the loop -- confirm against the upstream file.
    """
    written = 0
    for paragraph in page_soup.find_all('p'):
        markup = str(paragraph)
        if '<!-- -->' not in markup:
            continue
        text = BeautifulSoup(markup, 'lxml').get_text()
        for sentence in text.split():
            out.write(sentence.encode('utf-8') + '\n\n')
        written += 1
    return written


# For a quick test, restrict the crawl to a single poet, e.g.:
# poemlists = ['/PoemIndex.aspx?dynasty=Tang&author=李白']
poemcollection = []  # not used below; kept so the module-level name still exists
n = 0       # poem paragraphs written so far
person = 0  # poets processed so far

# The context manager closes poems.txt even when a fetch error ends the run.
with open('poems.txt', 'a') as fh:
    for poemlist in poemlists:
        # Wall-clock time: the work is network-bound, so CPU time would
        # almost never exceed the five-second threshold checked below.
        checkpoint1 = time.time()
        page = 0
        while True:
            # Fetch errors are deliberately not caught, as before.
            html = view_poet(poemlist + '&type=All&page=' + str(page))
            has_next = False
            try:
                # The "next page" link text tells us whether to keep paging;
                # the last page is still written before the loop stops.
                has_next = '下一页' in str(html)
                n += _write_poems(html, fh)
            except Exception as err:
                # Report and move on. Retrying the same page could loop
                # forever on a deterministic error and would duplicate the
                # lines already written. KeyboardInterrupt is not caught, so
                # the crawl can still be stopped by hand.
                print('skipped page ' + str(page) + ' of ' + repr(poemlist)
                      + ': ' + repr(err))
            if not has_next:
                break
            page += 1
        checkpoint2 = time.time()
        person += 1
        # Progress report after a slow poet and after every tenth poet.
        if (checkpoint2 - checkpoint1) > 5 or person % 10 == 0:
            print('已抓取' + str(person) + '人')

# Same clock as starttime (set with time.clock() at the top of the script),
# so the difference below is meaningful.
endtime = time.clock()
print('写入完毕,搜集到' + str(n) + '段唐诗')
print('Totally spend' + str((endtime - starttime)) + 'seconds')