|
| 1 | +#coding=utf8 |
| 2 | +import pickle |
| 3 | +import wechatsogou |
| 4 | +import urllib2 |
| 5 | +import lxml.etree |
| 6 | +import os |
| 7 | +import pymysql |
| 8 | +import json |
| 9 | + |
# Path of the pickle file that holds the titles of articles already sent
# successfully, so that repeated runs do not send duplicate e-mails.
file_path = 'sent_articles_file'

ws_api = wechatsogou.WechatSogouAPI()

# Connect to the database.
tablename = 'pythonwechat'
# The connection charset is utf8mb4 so that it matches the utf8mb4 table
# columns set up by get_connect(); MySQL's 'utf8' is limited to 3-byte
# characters and cannot transmit 4-byte code points such as emoji.
db = pymysql.connect(host='127.0.0.1', user='root', passwd='', db='itchat', charset='utf8mb4')
cur = db.cursor()
cur.execute('USE itchat')
| 20 | + |
# Fetch the article list of an official account.
def get_article(gzh):
    """Return the 'article' entry of the history lookup for account *gzh*.

    The number of articles found is printed before the list is returned.
    """
    history = ws_api.get_gzh_article_by_history(gzh)
    article_list = history['article']
    print(len(article_list))
    return article_list
| 26 | + |
# Fetch the content of a web page.
def get_html(url):
    """Download *url* and return the raw response body.

    The response object is always closed, even when reading fails, so the
    underlying socket is not leaked across the many requests this script
    makes. Errors raised by urllib2 propagate to the caller.
    """
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        return response.read()
    finally:
        response.close()
| 33 | + |
# Download the images of an article and record the article in the database.
def get_image(title, imgArray, source, time):
    """Save every image in *imgArray* under ./imgs and insert one table row.

    Parameters:
        title, source, time: lists of text nodes (the caller passes XPath
            results); only the first element of each is stored, with
            surrounding whitespace and newlines removed. An empty list is
            stored as an empty string instead of raising IndexError.
        imgArray: list of image URLs; also stored in the row as JSON.

    Each image is written to 'imgs/<last 30 chars of the URL, "/" -> "-">.png'.
    """
    def _first_clean(values):
        # XPath may match nothing, in which case there is no first element.
        return values[0].strip().replace("\n", "") if values else ""

    if not os.path.isdir('./imgs'):
        os.mkdir("./imgs")
    for item in imgArray:
        # 'wb': image data is binary, and truncating (instead of appending)
        # keeps a re-run from corrupting a file that was downloaded earlier.
        with open('imgs/' + item[-30:].replace('/', '-') + ".png", 'wb') as img_file:
            img_file.write(get_html(item))

    cur.execute(
        'INSERT INTO ' + tablename + ' (title, img,source,time) VALUES (%s, %s,%s, %s)',
        (_first_clean(title), json.dumps(imgArray, ensure_ascii=False),
         _first_clean(source), _first_clean(time)))
    cur.connection.commit()
    print(title[0] if title else "")
    print("------------------------ 插入成功 ----------------------------------")
| 51 | + |
# Prepare the database table (despite the name, no connection is opened here).
def get_connect():
    """Create the article table if needed and convert it to utf8mb4.

    Uses the module-level cursor. A failure of the CREATE statement is
    printed and otherwise ignored so that the charset conversion below is
    still attempted on an existing table.
    """
    try:
        # IF NOT EXISTS makes repeated runs succeed without relying on which
        # exception class the driver uses for "table already exists"; newer
        # PyMySQL releases report it as OperationalError, not InternalError.
        cur.execute(
            'CREATE TABLE IF NOT EXISTS ' + tablename + ' (id BIGINT(7) NOT NULL AUTO_INCREMENT, title VARCHAR(1000), img VARCHAR(1000), source VARCHAR(1000), time VARCHAR(1000), created TIMESTAMP DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY(id))')
    except (pymysql.err.InternalError, pymysql.err.OperationalError) as e:
        print(e)
    # Switch the database, the table and each text column to utf8mb4.
    cur.execute('ALTER DATABASE itchat CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci')
    cur.execute(
        'ALTER TABLE ' + tablename + ' CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
    for column in ('title', 'img', 'source', 'time'):
        cur.execute(
            'ALTER TABLE ' + tablename + ' CHANGE ' + column + ' ' + column
            + ' VARCHAR(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci')
| 73 | + |
| 74 | + |
if '__main__' == __name__:

    get_connect()

    # Official accounts to crawl.
    gzh_list = ['技术最前线', 'python', '全民独立经纪人', '程序视界', '非著名程序员']

    for gzh in gzh_list:
        # Before looking up the account, load the list of articles already
        # sent successfully from the pickle file.
        # NOTE(review): sent_list is loaded but neither consulted nor saved
        # in the code visible here - confirm where the de-duplication happens.
        sent_list = []
        if os.path.exists(file_path):
            # pickle must only be used on trusted files; this one is local.
            # The context manager closes the file even if loading fails.
            with open(file_path, 'rb') as f:
                sent_list = pickle.load(f)
        articles = get_article(gzh)
        for article in articles:
            print(article['title'], '\n\t', article['content_url'])

            xmlcontent = lxml.etree.HTML(get_html(article['content_url']))
            title = xmlcontent.xpath('//h2[@class="rich_media_title"]/text()')
            imgArray = xmlcontent.xpath('//img[@data-type="png"]/@data-src')
            # Source (account nickname) and publication time.
            source = xmlcontent.xpath(
                '//span[@class="rich_media_meta rich_media_meta_text rich_media_meta_nickname"]/text()')
            time = xmlcontent.xpath('//em[@class="rich_media_meta rich_media_meta_text"]/text()')
            # Single-argument print calls produce the same output as the
            # Python 2 print statement and also parse under Python 3.
            print("来源、时间")
            print("%s %s" % (source, time))
            # Download the images and store the article row.
            print("下载图片")
            get_image(title, imgArray, source, time)
| 104 | + |
0 commit comments