
Commit d9969bd

Initial impl

1 parent 84c2a86 commit d9969bd

20 files changed, +523 -0 lines changed

Diff for: .gitignore

+4
@@ -0,0 +1,4 @@
.idea
*.pyc
build
dist

Diff for: CHANGELOG.md

Whitespace-only changes.

Diff for: CONTRIBUTING.md

+3
@@ -0,0 +1,3 @@
Contributions to the scrapy-rabbitmq code base are welcome and encouraged! Fork the repo,
write your code, test your changes, then submit a pull request. Including a test URL in your PR
will speed things up on our side. Thanks for contributing to this open source tool for Scrapy!

Diff for: LICENSE

+7
@@ -0,0 +1,7 @@
Copyright (c) 2014 Royce Haynes

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

Diff for: MANIFEST.in

+1
@@ -0,0 +1 @@
include README.md

Diff for: README.md

+39
@@ -0,0 +1,39 @@
# Use RabbitMQ to feed and queue Scrapy spiders

Scrapy-rabbitmq is a tool that lets you feed and queue URLs from RabbitMQ using the [Scrapy framework](http://doc.scrapy.org/en/latest/index.html).

## Installation

Using pip, type into your command-line prompt:

```
pip install scrapy-rabbitmq
```

Or clone the repo and, inside the scrapy-rabbitmq directory, type:

```
python setup.py install
```

## Usage

TODO

## Contributing and Forking

TODO

## Releases

See the [changelog](CHANGELOG.md) for release details.

| Version | Release Date |
| :-----: | :----------: |
| 0.1.0   | 2014-11-14   |

## Copyright & License

Copyright (c) 2014 Royce Haynes - Released under The MIT License.
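Since the Usage section above is still a TODO, here is a minimal, hedged sketch of how this commit's pieces might be wired into a Scrapy project's settings.py. The RABBITMQ_* keys are the ones read by scrapy_rabbitmq/connection.py below, and the pipeline and dupefilter paths match files in this commit; the dict-style ITEM_PIPELINES and the priority value 300 are assumptions about the target Scrapy version, not taken from this commit.

```
# settings.py -- a sketch, not taken from this commit's README (Usage is TODO)

# Read by scrapy_rabbitmq.connection.from_settings(); these values are
# also its built-in defaults.
RABBITMQ_CONNECTION_TYPE = 'blocking'
RABBITMQ_QUEUE_NAME = 'scrapy_queue'
RABBITMQ_CONNECTION_PARAMETERS = {'host': 'localhost'}

# Publish scraped items to RabbitMQ (priority 300 is an arbitrary choice).
ITEM_PIPELINES = {
    'scrapy_rabbitmq.pipelines.RabbitMQPipeline': 300,
}

# Optional: RabbitMQ-backed duplicate filter from this commit.
DUPEFILTER_CLASS = 'scrapy_rabbitmq.dupefilter.RFPDupeFilter'
```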

Diff for: examples/README.md

Whitespace-only changes.

Diff for: scrapy_rabbitmq.egg-info/PKG-INFO

+10
@@ -0,0 +1,10 @@
Metadata-Version: 1.0
Name: scrapy-rabbitmq
Version: 0.0.0
Summary: UNKNOWN
Home-page: https://github.com/roycehaynes/scrapy-rabbitmq
Author: Royce Haynes
Author-email: UNKNOWN
License: MIT
Description: UNKNOWN
Platform: UNKNOWN

Diff for: scrapy_rabbitmq.egg-info/SOURCES.txt

+15
@@ -0,0 +1,15 @@
MANIFEST.in
README.md
setup.py
scrapy_rabbitmq/__init__.py
scrapy_rabbitmq/connection.py
scrapy_rabbitmq/dupefilter.py
scrapy_rabbitmq/pipelines.py
scrapy_rabbitmq/queue.py
scrapy_rabbitmq/scheduler.py
scrapy_rabbitmq/spiders.py
scrapy_rabbitmq.egg-info/PKG-INFO
scrapy_rabbitmq.egg-info/SOURCES.txt
scrapy_rabbitmq.egg-info/dependency_links.txt
scrapy_rabbitmq.egg-info/requires.txt
scrapy_rabbitmq.egg-info/top_level.txt

Diff for: scrapy_rabbitmq.egg-info/dependency_links.txt

+1
@@ -0,0 +1 @@

Diff for: scrapy_rabbitmq.egg-info/requires.txt

+2
@@ -0,0 +1,2 @@
pika
Scrapy>=0.14

Diff for: scrapy_rabbitmq.egg-info/top_level.txt

+1
@@ -0,0 +1 @@
scrapy_rabbitmq

Diff for: scrapy_rabbitmq/__init__.py

+18
@@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-

"""
Scrapy RabbitMQ
~~~~~~~~~~~~~~~~~~~~~~

TODO

Usage:

TODO

"""

__title__ = 'scrapy-rabbitmq'
__version__ = '0.1.0'
__author__ = 'Royce Haynes'
__copyright__ = 'Copyright 2014 Royce Haynes'

Diff for: scrapy_rabbitmq/connection.py

+63
@@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-

try:
    import pika
except ImportError:
    raise ImportError("Please install pika before running scrapy-rabbitmq.")


RABBITMQ_CONNECTION_TYPE = 'blocking'
RABBITMQ_QUEUE_NAME = 'scrapy_queue'
RABBITMQ_CONNECTION_PARAMETERS = {'host': 'localhost'}


def from_settings(settings):
    """ Factory method that returns a channel instance.

    :param str connection_type: This field can be `blocking`,
        `asyncore`, `libev`, `select`, `tornado`, or `twisted`

    See pika documentation for more details:
    TODO: put pika url regarding connection type

    Parameters is a dictionary that can
    include the following values:

    :param str host: Hostname or IP Address to connect to
    :param int port: TCP port to connect to
    :param str virtual_host: RabbitMQ virtual host to use
    :param pika.credentials.Credentials credentials: auth credentials
    :param int channel_max: Maximum number of channels to allow
    :param int frame_max: The maximum byte size for an AMQP frame
    :param int heartbeat_interval: How often to send heartbeats
    :param bool ssl: Enable SSL
    :param dict ssl_options: Arguments passed to ssl.wrap_socket
    :param int connection_attempts: Maximum number of retry attempts
    :param int|float retry_delay: Time to wait in seconds, before the next
        connection attempt
    :param int|float socket_timeout: Use for high latency networks
    :param str locale: Set the locale value
    :param bool backpressure_detection: Toggle backpressure detection

    :return: Channel object
    """

    connection_type = settings.get('RABBITMQ_CONNECTION_TYPE', RABBITMQ_CONNECTION_TYPE)
    queue_name = settings.get('RABBITMQ_QUEUE_NAME', RABBITMQ_QUEUE_NAME)
    connection_parameters = settings.get('RABBITMQ_CONNECTION_PARAMETERS', RABBITMQ_CONNECTION_PARAMETERS)

    # Map the configured connection type to the matching pika adapter and
    # open the connection.
    connection = {
        'blocking': pika.BlockingConnection,
        'asyncore': pika.AsyncoreConnection,
        'libev': pika.LibevConnection,
        'select': pika.SelectConnection,
        'tornado': pika.TornadoConnection,
        'twisted': pika.TwistedConnection
    }[connection_type](pika.ConnectionParameters(**connection_parameters))

    # Declare a durable queue so messages survive broker restarts.
    channel = connection.channel()
    channel.queue_declare(queue=queue_name, durable=True)

    return channel
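A hedged usage sketch for from_settings: it takes a Scrapy settings object, opens a pika connection of the configured type, declares the configured queue as durable, and returns the channel. The running local broker and the use of scrapy.settings.Settings are assumptions for illustration, not part of this commit.

```
# Minimal sketch, assuming a RabbitMQ broker on localhost and a
# scrapy.settings.Settings object (2014-era Python 2 / pika 0.9.x code base).
from scrapy.settings import Settings
from scrapy_rabbitmq import connection

channel = connection.from_settings(Settings())  # declares durable 'scrapy_queue'
channel.basic_publish(exchange='',
                      routing_key='scrapy_queue',
                      body=b'http://example.com/')  # feed a URL to be crawled
```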

Diff for: scrapy_rabbitmq/dupefilter.py

+58
@@ -0,0 +1,58 @@
__author__ = 'roycehaynes'

import time
import connection

from scrapy.dupefilter import BaseDupeFilter
from scrapy.utils.request import request_fingerprint


class RFPDupeFilter(BaseDupeFilter):
    """RabbitMQ-based request duplication filter"""

    def __init__(self, server, key):
        """Initialize duplication filter

        Parameters
        ----------
        server : RabbitMQ channel instance
        key : str
            Where to store fingerprints
        """
        self.server = server
        self.key = key

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        # Create a one-time key. This is needed to support using this class
        # as a standalone dupefilter with Scrapy's default scheduler; if
        # Scrapy passed the spider to the open() method, it wouldn't be.
        key = "dupefilter:%s" % int(time.time())
        return cls(server, key)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def request_seen(self, request):
        fp = request_fingerprint(request)

        # Publish the fingerprint to the dupefilter queue. basic_publish
        # only returns a boolean when publisher confirms are enabled on
        # the channel.
        added = self.server.basic_publish(
            exchange='',
            routing_key=self.key,
            body=fp
        )

        return not added

    def close(self, reason):
        """Delete data on close. Called by Scrapy's scheduler."""
        self.clear()

    def clear(self):
        """Clears fingerprints data"""
        self.server.queue_purge(self.key)
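What request_seen reports depends on publisher confirms: with 2014-era pika 0.9.x, basic_publish returns True/False once confirm_delivery() has been called on the channel (newer pika raises on failed confirms instead). A hedged standalone sketch; the confirm_delivery call and queue declaration are assumptions about the intended setup, not configured by this commit.

```
# Hedged sketch: exercising RFPDupeFilter standalone (local broker assumed).
from scrapy.http import Request
from scrapy.settings import Settings
from scrapy_rabbitmq.dupefilter import RFPDupeFilter

df = RFPDupeFilter.from_settings(Settings())
df.server.confirm_delivery()           # so basic_publish returns a boolean
df.server.queue_declare(queue=df.key)  # fingerprint queue must exist

print(df.request_seen(Request('http://example.com/')))  # False once published
df.close('finished')  # purges the fingerprint queue
```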

Diff for: scrapy_rabbitmq/pipelines.py

+37
@@ -0,0 +1,37 @@

import connection

from twisted.internet.threads import deferToThread
from scrapy.utils.serialize import ScrapyJSONEncoder


class RabbitMQPipeline(object):
    """Pushes a serialized item into a RabbitMQ queue"""

    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        # Serialize and publish off the reactor thread.
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.basic_publish(exchange='',
                                  routing_key=key,
                                  body=data)
        return item

    def item_key(self, item, spider):
        """Returns the RabbitMQ routing key for the given spider"""
        return "%s:items" % spider.name

Diff for: scrapy_rabbitmq/queue.py

+74
@@ -0,0 +1,74 @@
from scrapy.utils.reqser import request_to_dict, request_from_dict

try:
    import cPickle as pickle
except ImportError:
    import pickle


class Base(object):
    """Per-spider queue/stack base class"""

    def __init__(self, server, spider, key, exchange):
        """Initialize per-spider RabbitMQ queue.

        Parameters:
            server -- RabbitMQ channel
            spider -- spider instance
            key -- key for this queue (e.g. "%(spider)s:queue")
        """
        self.server = server
        self.spider = spider
        self.key = key % {'spider': spider.name}

    def _encode_request(self, request):
        """Encode a request object"""
        return pickle.dumps(request_to_dict(request, self.spider), protocol=-1)

    def _decode_request(self, encoded_request):
        """Decode a request previously encoded"""
        return request_from_dict(pickle.loads(encoded_request), self.spider)

    def __len__(self):
        """Return the length of the queue"""
        raise NotImplementedError

    def push(self, request):
        """Push a request"""
        raise NotImplementedError

    def pop(self, timeout=0):
        """Pop a request"""
        raise NotImplementedError

    def clear(self):
        """Clear queue/stack"""
        self.server.queue_purge(self.key)


class SpiderQueue(Base):
    """Per-spider FIFO queue"""

    def __len__(self):
        """Return the length of the queue"""
        # A passive declare asks the broker for the queue's message count.
        response = self.server.queue_declare(self.key, passive=True)
        return response.method.message_count

    def push(self, request):
        """Push a request"""
        self.server.basic_publish(
            exchange='',
            routing_key=self.key,
            body=self._encode_request(request)
        )

    def pop(self):
        """Pop a request"""

        method_frame, header, body = self.server.basic_get(queue=self.key)

        if body:
            return self._decode_request(body)


__all__ = ['SpiderQueue']
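A hedged, self-contained sketch of SpiderQueue: the stub spider class and the explicit queue declaration are illustrative assumptions, and `exchange` is accepted by Base.__init__ but unused in this commit, so an empty string is passed.

```
# Hedged sketch of pushing and popping a Scrapy request (local broker assumed).
from scrapy.http import Request
from scrapy.settings import Settings
from scrapy_rabbitmq import connection
from scrapy_rabbitmq.queue import SpiderQueue


class StubSpider(object):
    """Minimal stand-in for a Scrapy spider, just for this sketch."""
    name = 'example'


channel = connection.from_settings(Settings())
queue = SpiderQueue(channel, StubSpider(), '%(spider)s:queue', exchange='')
channel.queue_declare(queue=queue.key, durable=True)  # 'example:queue'

queue.push(Request('http://example.com/'))  # pickled request_to_dict payload
print(len(queue))       # message count via passive queue_declare
request = queue.pop()   # decoded Request, or None if the queue is empty
```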
