Skip to content

Commit fab019f

Browse files
committed
itchat获取微信公众号文章
1 parent d458c48 commit fab019f

File tree

3 files changed

+105
-0
lines changed

3 files changed

+105
-0
lines changed

.DS_Store

0 Bytes
Binary file not shown.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
#coding=utf8
2+
import pickle
3+
import wechatsogou
4+
import urllib2
5+
import lxml.etree
6+
import os
7+
import pymysql
8+
import json
9+
10+
# File meant to hold the pickled titles of articles that were already sent,
# so that repeated runs do not send duplicate e-mails.
file_path = 'sent_articles_file'

ws_api = wechatsogou.WechatSogouAPI()

# Database connection.
# The connection charset is utf8mb4 to match the table, which get_connect()
# converts to utf8mb4; with plain 'utf8' 4-byte characters (e.g. emoji in
# article titles) cannot be sent over the connection.
tablename = 'pythonwechat'
db = pymysql.connect(host='127.0.0.1', user='root', passwd='', db='itchat', charset='utf8mb4')
# connect() already selects the `itchat` database, so no explicit USE is needed.
cur = db.cursor()
20+
21+
# Fetch the article list of an official account
def get_article(gzh):
    """Return the 'article' entry of the history lookup for account ``gzh``.

    The lookup is done through the module-level ``ws_api`` client; the number
    of articles found is printed before the list is returned.
    """
    history = ws_api.get_gzh_article_by_history(gzh)
    article_list = history['article']
    print(len(article_list))
    return article_list
26+
27+
# Fetch the raw content behind a URL
def get_html(url):
    """Download ``url`` and return the response body.

    Args:
        url: address to fetch.

    Returns:
        The body exactly as returned by ``response.read()``.

    Raises:
        Whatever ``urllib2.urlopen`` raises for an unreachable or failing URL.
    """
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    try:
        return response.read()
    finally:
        # Release the underlying socket even when read() fails.
        response.close()
33+
34+
# Download an article's images and store the article in the database
def _first_clean(values):
    """Return the first item of ``values`` stripped and without newlines, or '' when empty."""
    if not values:
        return ''
    return values[0].strip().replace("\n", "")


def get_image(title, imgArray, source, time):
    """Download every image of an article and insert one row describing it.

    Each URL in ``imgArray`` is fetched with ``get_html`` and written to the
    ``imgs`` directory (created on demand). The file name is the last 30
    characters of the URL with ``/`` replaced by ``-`` plus a ``.png``
    suffix. Afterwards one row (title, JSON-encoded URL list, source, time)
    is inserted into ``tablename`` through the module-level cursor and
    committed.

    Args:
        title: list of title strings; only the first entry is stored.
        imgArray: list of image URLs.
        source: list of source strings; only the first entry is stored.
        time: list of publication-time strings; only the first entry is stored.

    An empty ``title``/``source``/``time`` list is stored as an empty string
    instead of raising ``IndexError``.
    """
    img_dir = 'imgs'
    if not os.path.isdir(img_dir):
        os.mkdir(img_dir)

    for img_url in imgArray:
        file_name = img_url[-30:].replace('/', '-') + ".png"
        # 'wb': image data is binary, and a re-run must overwrite the file
        # rather than append a second copy to it.
        with open(os.path.join(img_dir, file_name), 'wb') as image_file:
            image_file.write(get_html(img_url))

    clean_title = _first_clean(title)
    # The table name is a module constant; all values are passed as parameters.
    cur.execute(
        'INSERT INTO ' + tablename + ' (title, img,source,time) VALUES (%s, %s,%s, %s)',
        (clean_title, json.dumps(imgArray, ensure_ascii=False),
         _first_clean(source), _first_clean(time)))
    cur.connection.commit()
    print(clean_title)
    print("------------------------ 插入成功 ----------------------------------")
51+
52+
# Prepare the database schema
def get_connect():
    """Ensure the article table exists and that it stores utf8mb4 text.

    Despite its name this function does not open a connection: it uses the
    module-level cursor ``cur`` to create ``tablename`` when it is missing
    and then switches the ``itchat`` database, the table and its text
    columns to utf8mb4 / utf8mb4_unicode_ci.
    """
    # IF NOT EXISTS instead of catching the "table already exists" error:
    # the exception class PyMySQL raises for that case appears to differ
    # between releases, and catching it also hid unrelated failures.
    cur.execute(
        'CREATE TABLE IF NOT EXISTS ' + tablename +
        ' (id BIGINT(7) NOT NULL AUTO_INCREMENT, title VARCHAR(1000), img VARCHAR(1000),'
        ' source VARCHAR(1000), time VARCHAR(1000), created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,'
        ' PRIMARY KEY(id))')

    # Switch database, table and text columns to utf8mb4.
    cur.execute('ALTER DATABASE itchat CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci')
    cur.execute(
        'ALTER TABLE ' + tablename + ' CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
    for column in ('title', 'img', 'source', 'time'):
        cur.execute(
            'ALTER TABLE ' + tablename + ' CHANGE ' + column + ' ' + column +
            ' VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
73+
74+
75+
def main():
    """Crawl the configured official accounts and store their articles.

    For every account the article list is fetched with ``get_article``; each
    article page is downloaded and parsed, and its title, PNG image URLs,
    source and publication time are handed to ``get_image``. The cursor and
    connection are closed when the run ends, whether it succeeds or fails.
    """
    # Official accounts to crawl
    gzh_list = ['技术最前线', 'python', '全民独立经纪人', '程序视界', '非著名程序员']

    try:
        get_connect()

        # TODO: de-duplication across runs is not implemented; every run
        # stores all articles that are found again.
        for gzh in gzh_list:
            for article in get_article(gzh):
                # Separate prints: a multi-argument print(...) shows a tuple
                # repr under Python 2.
                print(article['title'])
                print('\t' + article['content_url'])

                page = lxml.etree.HTML(get_html(article['content_url']))
                title = page.xpath('//h2[@class="rich_media_title"]/text()')
                imgArray = page.xpath('//img[@data-type="png"]/@data-src')
                # Source (account nickname) and publication time
                source = page.xpath(
                    '//span[@class="rich_media_meta rich_media_meta_text rich_media_meta_nickname"]/text()')
                publish_time = page.xpath('//em[@class="rich_media_meta rich_media_meta_text"]/text()')

                if not title:
                    # No title element was found, so there is nothing
                    # meaningful to store for this page.
                    print('no article title found, skipping: ' + article['content_url'])
                    continue

                print("来源、时间")
                print('%s %s' % (source, publish_time))
                # Download the images and insert the article row
                print("下载图片")
                get_image(title, imgArray, source, publish_time)
    finally:
        cur.close()
        db.close()


if '__main__' == __name__:
    main()
104+

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,4 @@
22
itchat微信接口
33

44
01 itchat获取微信好友或者微信群分享文章<br>
5+
02 itchat获取微信公众号文章<br>

0 commit comments

Comments
 (0)