scrapy_twitter.py
# coding:utf-8
from scrapy import log
from scrapy.http import Request, Response
import twitter
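
# Glue between Scrapy and the python-twitter client: Twitter API calls are
# modelled as Request subclasses carrying the API parameters, the downloader
# middleware below answers them by calling the API instead of fetching a URL,
# and the result comes back as a TwitterResponse whose `tweets` attribute
# holds the decoded tweets.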


class TwitterUserTimelineRequest(Request):
    def __init__(self, *args, **kwargs):
        self.screen_name = kwargs.pop('screen_name', None)
        self.count = kwargs.pop('count', None)
        self.max_id = kwargs.pop('max_id', None)
        super(TwitterUserTimelineRequest, self).__init__('http://twitter.com',
                                                         dont_filter=True,
                                                         **kwargs)


class TwitterStreamFilterRequest(Request):
    def __init__(self, *args, **kwargs):
        self.track = kwargs.pop('track', None)
        super(TwitterStreamFilterRequest, self).__init__('http://twitter.com',
                                                         dont_filter=True,
                                                         **kwargs)


class TwitterResponse(Response):
    def __init__(self, *args, **kwargs):
        self.tweets = kwargs.pop('tweets', None)
        super(TwitterResponse, self).__init__('http://twitter.com',
                                              *args,
                                              **kwargs)


class TwitterDownloaderMiddleware(object):
    """Answers the Twitter*Request objects above through the python-twitter
    API client rather than by downloading their (placeholder) URL."""

    def __init__(self,
                 consumer_key, consumer_secret,
                 access_token_key, access_token_secret):
        self.api = twitter.Api(consumer_key=consumer_key,
                               consumer_secret=consumer_secret,
                               access_token_key=access_token_key,
                               access_token_secret=access_token_secret)
        log.msg('Using creds [CONSUMER KEY: %s, ACCESS TOKEN KEY: %s]' %
                (consumer_key, access_token_key),
                level=log.INFO)

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        consumer_key = settings['TWITTER_CONSUMER_KEY']
        consumer_secret = settings['TWITTER_CONSUMER_SECRET']
        access_token_key = settings['TWITTER_ACCESS_TOKEN_KEY']
        access_token_secret = settings['TWITTER_ACCESS_TOKEN_SECRET']
        return cls(consumer_key,
                   consumer_secret,
                   access_token_key,
                   access_token_secret)

    def process_request(self, request, spider):
        # Intercept the custom request types and build the response from the
        # Twitter API; any other request falls through to the downloader.
        if isinstance(request, TwitterUserTimelineRequest):
            tweets = self.api.GetUserTimeline(screen_name=request.screen_name,
                                              count=request.count,
                                              max_id=request.max_id)
            return TwitterResponse(tweets=[tweet.AsDict() for tweet in tweets])
        if isinstance(request, TwitterStreamFilterRequest):
            tweets = self.api.GetStreamFilter(track=request.track)
            return TwitterResponse(tweets=tweets)

    def process_response(self, request, response, spider):
        return response


from scrapy.item import DictItem, Field


def to_item(dict_tweet):
    # Build a TweetItem class on the fly, with one Field per key of the
    # tweet dict, and populate it with the dict's values.
    field_list = dict_tweet.keys()
    fields = {field_name: Field() for field_name in field_list}
    item_class = type('TweetItem', (DictItem,), {'fields': fields})
    return item_class(dict_tweet)
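

# Usage sketch, assuming a standard Scrapy project. Only the setting names
# come from from_crawler above; the middleware priority and the example
# spider are illustrative assumptions, not part of this module.
#
# settings.py:
#
#   TWITTER_CONSUMER_KEY = '...'
#   TWITTER_CONSUMER_SECRET = '...'
#   TWITTER_ACCESS_TOKEN_KEY = '...'
#   TWITTER_ACCESS_TOKEN_SECRET = '...'
#   DOWNLOADER_MIDDLEWARES = {
#       'scrapy_twitter.TwitterDownloaderMiddleware': 10,
#   }
#
# hypothetical spider:
#
#   from scrapy import Spider
#   from scrapy_twitter import TwitterUserTimelineRequest, to_item
#
#   class UserTimelineSpider(Spider):
#       name = 'user-timeline'
#
#       def start_requests(self):
#           yield TwitterUserTimelineRequest(screen_name='scrapy', count=100)
#
#       def parse(self, response):
#           for tweet in response.tweets:
#               yield to_item(tweet)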