
Commit d9969bd

Initial impl

1 parent 84c2a86 commit d9969bd

20 files changed, +523 -0 lines changed

Diff for: .gitignore

+4
@@ -0,0 +1,4 @@
.idea
*.pyc
build
dist

Diff for: CHANGELOG.md

Whitespace-only changes.

Diff for: CONTRIBUTING.md

+3
@@ -0,0 +1,3 @@
Contributions to the scrapy-rabbitmq code base are welcome and encouraged! Fork the repo,
write your code, test your changes, then submit a pull request. Including a test URL in your PR
will speed things up on our side. Thanks for contributing to this open source tool for Scrapy!

Diff for: LICENSE

+7
@@ -0,0 +1,7 @@
Copyright (c) 2014 Royce Haynes

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

Diff for: MANIFEST.in

+1
@@ -0,0 +1 @@
include README.md

Diff for: README.md

+39
@@ -0,0 +1,39 @@
# Use RabbitMQ to feed and queue Scrapy spiders

Scrapy-rabbitmq is a tool that lets you feed and queue URLs from RabbitMQ using the [Scrapy framework](http://doc.scrapy.org/en/latest/index.html).

## Installation

Using pip, type into your command-line prompt:

```
pip install scrapy-rabbitmq
```

Or clone the repo and, inside the scrapy-rabbitmq directory, type:

```
python setup.py install
```

## Usage

TODO

## Contributing and Forking

TODO

## Releases

See the [changelog](CHANGELOG.md) for release details.

| Version | Release Date |
| :-----: | :----------: |
| 0.1.0   | 2014-11-14   |

## Copyright & License

Copyright (c) 2014 Royce Haynes - Released under The MIT License.
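Since the Usage section above is still a TODO, here is a minimal, hedged sketch of how this commit's pieces might be wired into a Scrapy project's settings.py. The RABBITMQ_* keys are the ones read by scrapy_rabbitmq/connection.py below, and the pipeline and dupefilter paths match files in this commit; the dict-style ITEM_PIPELINES and the priority value 300 are assumptions about the target Scrapy version, not taken from this commit.

```
# settings.py -- a sketch, not taken from this commit's README (Usage is TODO)

# Read by scrapy_rabbitmq.connection.from_settings(); these values are
# also its built-in defaults.
RABBITMQ_CONNECTION_TYPE = 'blocking'
RABBITMQ_QUEUE_NAME = 'scrapy_queue'
RABBITMQ_CONNECTION_PARAMETERS = {'host': 'localhost'}

# Publish scraped items to RabbitMQ (priority 300 is an arbitrary choice).
ITEM_PIPELINES = {
    'scrapy_rabbitmq.pipelines.RabbitMQPipeline': 300,
}

# Optional: RabbitMQ-backed duplicate filter from this commit.
DUPEFILTER_CLASS = 'scrapy_rabbitmq.dupefilter.RFPDupeFilter'
```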

Diff for: examples/README.md

Whitespace-only changes.

Diff for: scrapy_rabbitmq.egg-info/PKG-INFO

+10
@@ -0,0 +1,10 @@
Metadata-Version: 1.0
Name: scrapy-rabbitmq
Version: 0.0.0
Summary: UNKNOWN
Home-page: https://github.com/roycehaynes/scrapy-rabbitmq
Author: Royce Haynes
Author-email: UNKNOWN
License: MIT
Description: UNKNOWN
Platform: UNKNOWN

Diff for: scrapy_rabbitmq.egg-info/SOURCES.txt

+15
@@ -0,0 +1,15 @@
MANIFEST.in
README.md
setup.py
scrapy_rabbitmq/__init__.py
scrapy_rabbitmq/connection.py
scrapy_rabbitmq/dupefilter.py
scrapy_rabbitmq/pipelines.py
scrapy_rabbitmq/queue.py
scrapy_rabbitmq/scheduler.py
scrapy_rabbitmq/spiders.py
scrapy_rabbitmq.egg-info/PKG-INFO
scrapy_rabbitmq.egg-info/SOURCES.txt
scrapy_rabbitmq.egg-info/dependency_links.txt
scrapy_rabbitmq.egg-info/requires.txt
scrapy_rabbitmq.egg-info/top_level.txt

Diff for: scrapy_rabbitmq.egg-info/dependency_links.txt

+1
@@ -0,0 +1 @@

Diff for: scrapy_rabbitmq.egg-info/requires.txt

+2
@@ -0,0 +1,2 @@
pika
Scrapy>=0.14

Diff for: scrapy_rabbitmq.egg-info/top_level.txt

+1
@@ -0,0 +1 @@
scrapy_rabbitmq

Diff for: scrapy_rabbitmq/__init__.py

+18
@@ -0,0 +1,18 @@
# -*- coding: utf-8 -*-

"""
Scrapy RabbitMQ
~~~~~~~~~~~~~~~~~~~~~~

TODO

Usage:

TODO

"""

__title__ = 'scrapy-rabbitmq'
__version__ = '0.1.0'
__author__ = 'Royce Haynes'
__copyright__ = 'Copyright 2014 Royce Haynes'

Diff for: scrapy_rabbitmq/connection.py

+63
@@ -0,0 +1,63 @@
# -*- coding: utf-8 -*-

try:
    import pika
except ImportError:
    raise ImportError("Please install pika before running scrapy-rabbitmq.")


RABBITMQ_CONNECTION_TYPE = 'blocking'
RABBITMQ_QUEUE_NAME = 'scrapy_queue'
RABBITMQ_CONNECTION_PARAMETERS = {'host': 'localhost'}


def from_settings(settings):
    """ Factory method that returns a channel instance.

    :param str connection_type: This field can be `blocking`,
        `asyncore`, `libev`, `select`, `tornado`, or `twisted`

    See pika documentation for more details:
    TODO: put pika url regarding connection type

    Parameters is a dictionary that can
    include the following values:

    :param str host: Hostname or IP Address to connect to
    :param int port: TCP port to connect to
    :param str virtual_host: RabbitMQ virtual host to use
    :param pika.credentials.Credentials credentials: auth credentials
    :param int channel_max: Maximum number of channels to allow
    :param int frame_max: The maximum byte size for an AMQP frame
    :param int heartbeat_interval: How often to send heartbeats
    :param bool ssl: Enable SSL
    :param dict ssl_options: Arguments passed to ssl.wrap_socket
    :param int connection_attempts: Maximum number of retry attempts
    :param int|float retry_delay: Time to wait in seconds, before the next
        connection attempt
    :param int|float socket_timeout: Use for high latency networks
    :param str locale: Set the locale value
    :param bool backpressure_detection: Toggle backpressure detection

    :return: Channel object
    """

    connection_type = settings.get('RABBITMQ_CONNECTION_TYPE', RABBITMQ_CONNECTION_TYPE)
    queue_name = settings.get('RABBITMQ_QUEUE_NAME', RABBITMQ_QUEUE_NAME)
    connection_parameters = settings.get('RABBITMQ_CONNECTION_PARAMETERS', RABBITMQ_CONNECTION_PARAMETERS)

    # Map the configured connection type to the matching pika adapter and
    # open the connection.
    connection = {
        'blocking': pika.BlockingConnection,
        'asyncore': pika.AsyncoreConnection,
        'libev': pika.LibevConnection,
        'select': pika.SelectConnection,
        'tornado': pika.TornadoConnection,
        'twisted': pika.TwistedConnection
    }[connection_type](pika.ConnectionParameters(**connection_parameters))

    # Declare a durable queue so messages survive broker restarts.
    channel = connection.channel()
    channel.queue_declare(queue=queue_name, durable=True)

    return channel
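A hedged usage sketch for from_settings: it takes a Scrapy settings object, opens a pika connection of the configured type, declares the configured queue as durable, and returns the channel. The running local broker and the use of scrapy.settings.Settings are assumptions for illustration, not part of this commit.

```
# Minimal sketch, assuming a RabbitMQ broker on localhost and a
# scrapy.settings.Settings object (2014-era Python 2 / pika 0.9.x code base).
from scrapy.settings import Settings
from scrapy_rabbitmq import connection

channel = connection.from_settings(Settings())  # declares durable 'scrapy_queue'
channel.basic_publish(exchange='',
                      routing_key='scrapy_queue',
                      body=b'http://example.com/')  # feed a URL to be crawled
```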

Diff for: scrapy_rabbitmq/dupefilter.py

+58
@@ -0,0 +1,58 @@
__author__ = 'roycehaynes'

import time
import connection

from scrapy.dupefilter import BaseDupeFilter
from scrapy.utils.request import request_fingerprint


class RFPDupeFilter(BaseDupeFilter):
    """RabbitMQ-based request duplication filter"""

    def __init__(self, server, key):
        """Initialize duplication filter

        Parameters
        ----------
        server : RabbitMQ channel instance
        key : str
            Where to store fingerprints
        """
        self.server = server
        self.key = key

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        # Create a one-time key. This is needed to support using this class
        # as a standalone dupefilter with Scrapy's default scheduler; if
        # Scrapy passed the spider to the open() method, it wouldn't be.
        key = "dupefilter:%s" % int(time.time())
        return cls(server, key)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def request_seen(self, request):
        fp = request_fingerprint(request)

        # Publish the fingerprint to the dupefilter queue. basic_publish
        # only returns a boolean when publisher confirms are enabled on
        # the channel.
        added = self.server.basic_publish(
            exchange='',
            routing_key=self.key,
            body=fp
        )

        return not added

    def close(self, reason):
        """Delete data on close. Called by Scrapy's scheduler."""
        self.clear()

    def clear(self):
        """Clears fingerprints data"""
        self.server.queue_purge(self.key)
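What request_seen reports depends on publisher confirms: with 2014-era pika 0.9.x, basic_publish returns True/False once confirm_delivery() has been called on the channel (newer pika raises on failed confirms instead). A hedged standalone sketch; the confirm_delivery call and queue declaration are assumptions about the intended setup, not configured by this commit.

```
# Hedged sketch: exercising RFPDupeFilter standalone (local broker assumed).
from scrapy.http import Request
from scrapy.settings import Settings
from scrapy_rabbitmq.dupefilter import RFPDupeFilter

df = RFPDupeFilter.from_settings(Settings())
df.server.confirm_delivery()           # so basic_publish returns a boolean
df.server.queue_declare(queue=df.key)  # fingerprint queue must exist

print(df.request_seen(Request('http://example.com/')))  # False once published
df.close('finished')  # purges the fingerprint queue
```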

Diff for: scrapy_rabbitmq/pipelines.py

+37
@@ -0,0 +1,37 @@

import connection

from twisted.internet.threads import deferToThread
from scrapy.utils.serialize import ScrapyJSONEncoder


class RabbitMQPipeline(object):
    """Pushes a serialized item into a RabbitMQ queue"""

    def __init__(self, server):
        self.server = server
        self.encoder = ScrapyJSONEncoder()

    @classmethod
    def from_settings(cls, settings):
        server = connection.from_settings(settings)
        return cls(server)

    @classmethod
    def from_crawler(cls, crawler):
        return cls.from_settings(crawler.settings)

    def process_item(self, item, spider):
        # Serialize and publish off the reactor thread.
        return deferToThread(self._process_item, item, spider)

    def _process_item(self, item, spider):
        key = self.item_key(item, spider)
        data = self.encoder.encode(item)
        self.server.basic_publish(exchange='',
                                  routing_key=key,
                                  body=data)
        return item

    def item_key(self, item, spider):
        """Returns the RabbitMQ routing key for the given spider"""
        return "%s:items" % spider.name

Diff for: scrapy_rabbitmq/queue.py

+74
@@ -0,0 +1,74 @@
from scrapy.utils.reqser import request_to_dict, request_from_dict

try:
    import cPickle as pickle
except ImportError:
    import pickle


class Base(object):
    """Per-spider queue/stack base class"""

    def __init__(self, server, spider, key, exchange):
        """Initialize per-spider RabbitMQ queue.

        Parameters:
            server -- RabbitMQ channel
            spider -- spider instance
            key -- key for this queue (e.g. "%(spider)s:queue")
        """
        self.server = server
        self.spider = spider
        self.key = key % {'spider': spider.name}

    def _encode_request(self, request):
        """Encode a request object"""
        return pickle.dumps(request_to_dict(request, self.spider), protocol=-1)

    def _decode_request(self, encoded_request):
        """Decode a request previously encoded"""
        return request_from_dict(pickle.loads(encoded_request), self.spider)

    def __len__(self):
        """Return the length of the queue"""
        raise NotImplementedError

    def push(self, request):
        """Push a request"""
        raise NotImplementedError

    def pop(self, timeout=0):
        """Pop a request"""
        raise NotImplementedError

    def clear(self):
        """Clear queue/stack"""
        self.server.queue_purge(self.key)


class SpiderQueue(Base):
    """Per-spider FIFO queue"""

    def __len__(self):
        """Return the length of the queue"""
        # A passive declare asks the broker for the queue's message count.
        response = self.server.queue_declare(self.key, passive=True)
        return response.method.message_count

    def push(self, request):
        """Push a request"""
        self.server.basic_publish(
            exchange='',
            routing_key=self.key,
            body=self._encode_request(request)
        )

    def pop(self):
        """Pop a request"""

        method_frame, header, body = self.server.basic_get(queue=self.key)

        if body:
            return self._decode_request(body)


__all__ = ['SpiderQueue']
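A hedged, self-contained sketch of SpiderQueue: the stub spider class and the explicit queue declaration are illustrative assumptions, and `exchange` is accepted by Base.__init__ but unused in this commit, so an empty string is passed.

```
# Hedged sketch of pushing and popping a Scrapy request (local broker assumed).
from scrapy.http import Request
from scrapy.settings import Settings
from scrapy_rabbitmq import connection
from scrapy_rabbitmq.queue import SpiderQueue


class StubSpider(object):
    """Minimal stand-in for a Scrapy spider, just for this sketch."""
    name = 'example'


channel = connection.from_settings(Settings())
queue = SpiderQueue(channel, StubSpider(), '%(spider)s:queue', exchange='')
channel.queue_declare(queue=queue.key, durable=True)  # 'example:queue'

queue.push(Request('http://example.com/'))  # pickled request_to_dict payload
print(len(queue))       # message count via passive queue_declare
request = queue.pop()   # decoded Request, or None if the queue is empty
```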
