
Commit f2153d3

pypi 0.2.1 release, added url to WorkGroup parameter
1 parent 46017c7 commit f2153d3

File tree

13 files changed: +60 -36 lines changed

CHANGES

Lines changed: 11 additions & 1 deletion

@@ -3,9 +3,19 @@
 ========================

 11/29/18
-- pypi 0.2.0 release
+- pypi 0.2.1 release
+
+- added `url` parameter to the WorkGroup which is a bit more attractive
+  API, instead of including the url in a kwarg. The reason why the url was
+  originally included as a kwarg is because depending on how the custom
+  Spider is setup, the url may already be specified, and it is redundant to
+  specify it again. But for API clarity sake, now we just insist the url is
+  specified in the WorkGroup. At least, it is easier to read at a quick glance.

 11/28/18
+
+- pypi 0.2.0 release
+
 More API breaking changes:

 - previously, the Worker needed to be explicitly defined in the
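For context, this entry moves the target url out of the kwargs dict and into a named parameter on the WorkGroup itself. A rough before-and-after sketch, borrowing the argument names from examples/books_to_scrape/main.py further down in this commit (imports omitted, so treat it as illustrative rather than a standalone script):

    # 0.2.0 style: the url rode along inside the kwargs dict
    group = WorkGroup(
        name='books.toscrape.com',
        spider=BooksToScrapeScraper,
        worker=BooksWorker,
        items=BookItems,
        loader=BookItemsLoader,
        exporters=exporters,
        workers=20,
        kwargs={'url': 'http://books.toscrape.com/', 'timeout': (3.0, 20.0)})

    # 0.2.1 style: the url is a first-class WorkGroup parameter
    group = WorkGroup(
        name='books.toscrape.com',
        url='http://books.toscrape.com/',
        spider=BooksToScrapeScraper,
        worker=BooksWorker,
        items=BookItems,
        loader=BookItemsLoader,
        exporters=exporters,
        workers=20,
        kwargs={'timeout': (3.0, 20.0)})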

README.rst

Lines changed: 13 additions & 12 deletions

@@ -5,8 +5,8 @@

 .. image:: https://img.shields.io/badge/Python-3.6%20%7C%203.7-blue.svg
     :target: https://github.com/bomquote/transistor
-.. image:: https://img.shields.io/badge/pypi%20package-0.2.0-blue.svg
-    :target: https://pypi.org/project/transistor/0.2.0/
+.. image:: https://img.shields.io/badge/pypi%20package-0.2.1-blue.svg
+    :target: https://pypi.org/project/transistor/0.2.1/
 .. image:: https://img.shields.io/badge/Status-Beta-blue.svg
     :target: https://github.com/bomquote/transistor
 .. image:: https://img.shields.io/badge/license-MIT-lightgrey.svg
@@ -167,13 +167,13 @@ Quickstart

 First, install ``Transistor`` from pypi:

-.. code-block:: python
+.. code-block:: rest

     pip install transistor

 If you have previously installed ``Transistor``, please ensure you are using the latest version:

-.. code-block:: python
+.. code-block:: rest

     pip-install --upgrade transistor

@@ -404,7 +404,7 @@ Specifically, we are interested in the `book_title`, `stock` and `price` attributes

         return self.items

-Finally, to run the scrape, we will need to create a main.py file. This is all we need for the minimal example to scrape and export targeted data to cvs.
+Finally, to run the scrape, we will need to create a main.py file. This is all we need for the minimal example to scrape and export targeted data to csv.

 So, at this point, we've:

@@ -455,12 +455,13 @@ Third, setup the ``WorkGroup`` in a list we'll call *groups*. We use a list here
     groups = [
         WorkGroup(
             name='books.toscrape.com',
+            url='http://books.toscrape.com/',
             spider=BooksToScrapeScraper,
             items=BookItems,
             loader=BookItemsLoader,
             exporters=exporters,
             workers=20, # this creates 20 scrapers and assigns each a book as a task
-            kwargs={'url': 'http://books.toscrape.com/', 'timeout': (3.0, 20.0)})
+            kwargs={'timeout': (3.0, 20.0)})
     ]

 Last, setup the ``WorkGroupManager`` and prepare the file to call the ``manager.main()`` method to start the scrape job:
@@ -506,7 +507,7 @@ Directly Using A SplashScraper

 Perhaps you just want to do a quick one-off scrape?

-It is possible to just use your custom scraper sublcassed from ``SplashScraper`` directly, without going through all the work to setup a ``StatefulBook``, ``BaseWorker``, ``BaseGroup``, ``WorkGroup``, and ``WorkGroupManager``.
+It is possible to just use your custom scraper subclassed from ``SplashScraper`` directly, without going through all the work to setup a ``StatefulBook``, ``BaseWorker``, ``BaseGroup``, ``WorkGroup``, and ``WorkGroupManager``.

 Just fire it up in a python repl like below and ensure the ``start_http_session`` method is run, which can generally be done by setting ``autorun=True``.

@@ -566,25 +567,25 @@ Next, we need to store our first two python objects in newt.db, which are:

 .. code-block:: python

-    from transistor.persistence.newt_db.collections import ScrapeList, ScrapeLists
+    from transistor.persistence.newt_db.collections import SpiderList, SpiderLists

 Now, from your python repl:

 .. code-block:: python

     from transistor.newt_db import ndb

-    >>> ndb.root.scrapes = ScrapeLists() # Assigning ScrapeLists() is only required during initial seup. Or else, when/if you change the ScrapeLists() object, for example, to provide more functionality to the class.
-    >>> ndb.root.scrapes.add('first-scrape', ScrapeList()) # You will add a new ScrapeList() anytime you need a new list container. Like, every single scrape you save. See ``process_exports`` method in ``examples/books_to_scrape/workgroup.py``.
+    >>> ndb.root.spiders = SpiderLists() # Assigning SpiderLists() is only required during initial setup. Or else, when/if you change the SpiderLists() object, for example, to provide more functionality to the class.
+    >>> ndb.root.spiders.add('first-scrape', SpiderList()) # You will add a new SpiderList() anytime you need a new list container. Like, every single scrape you save. See ``process_exports`` method in ``examples/books_to_scrape/workgroup.py``.
     >>> ndb.commit() # you must explicitly commit() after each change to newt.db.

 At this point, you are ready-to-go with newt.db and PostgreSQL.

-Later, when you have a scraper object instance, such as ``BooksToScrapeScraper()`` which has finished it's web scrape cycle, it will be stored in the ``ScrapeList()`` named ``first-scrape`` like such:
+Later, when you have a scraper object instance, such as ``BooksToScrapeScraper()`` which has finished it's web scrape cycle, it will be stored in the ``SpiderList()`` named ``first-scrape`` like such:

 .. code-block:: python

-    >>> ndb.root.scrapes['first-scrape'].add(BooksToScrapeScraper(name="books.toscrape.com", book_title="Soumission"))
+    >>> ndb.root.spiders['first-scrape'].add(BooksToScrapeScraper(name="books.toscrape.com", book_title="Soumission"))


 More on StatefulBook
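Reading a saved spider back out of newt.db is not shown in the diff above, but the subscript access in the last snippet suggests something along these lines; iteration over a SpiderList is an assumption here, not a documented guarantee:

    from transistor.newt_db import ndb

    saved = ndb.root.spiders['first-scrape']   # the SpiderList created above
    for spider in saved:                       # assumes SpiderList is iterable
        print(spider.name, spider.book_title)  # attrs passed to the constructor above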

dist/transistor-0.2.0.tar.gz

-114 KB
Binary file not shown.

dist/transistor-0.2.1.tar.gz

114 KB
Binary file not shown.

examples/books_to_scrape/main.py

Lines changed: 27 additions & 15 deletions

@@ -53,14 +53,16 @@
 # finally, the core of what we need to launch the scrape job
 from transistor import WorkGroup, StatefulBook
 from transistor.persistence.exporters import CsvItemExporter
+from transistor.persistence.exporters.json import JsonLinesItemExporter
 from examples.books_to_scrape.workgroup import BooksWorker
 from examples.books_to_scrape.scraper import BooksToScrapeScraper
 from examples.books_to_scrape.manager import BooksWorkGroupManager
 from examples.books_to_scrape.persistence.serialization import (
     BookItems, BookItemsLoader)


-# 1) get the excel file path which has the book_titles we are interested to scrape
+# 1) Get the excel file path which has the book_titles we are interested to scrape.
+
 def get_file_path(filename):
     """
     Find the book_title excel file path.
@@ -70,42 +72,52 @@ def get_file_path(filename):
     filepath = root / 'books_to_scrape' / filename
     return r'{}'.format(filepath)

-
 # 2) Create a StatefulBook instance to read the excel file and load the work queue.
 # Set a list of tracker names, with one tracker name for each WorkGroup you create
-# in step three. Ensure the tracker name matches the WorkGroup.name in step three.
+# in step four. Ensure the tracker name matches the WorkGroup.name in step four.
+
 file = get_file_path('book_titles.xlsx')
 trackers = ['books.toscrape.com']
-stateful_book = StatefulBook(file, trackers, keywords='titles', autorun=True)
+tasks = StatefulBook(file, trackers, keywords='titles', autorun=True)
+
+# 3) Setup a list of exporters which than then be passed to whichever WorkGroup
+# objects you want to use them with. In this case, we are just going to use the
+# built-in CsvItemExporter but we could also use additional exporters to do
+# multiple exports at the same time, if desired.

+exporters = [CsvItemExporter(
+    fields_to_export=['book_title', 'stock', 'price'],
+    file=open('c:/tmp/book_data.csv', 'a+b')),
+    JsonLinesItemExporter(
+        fields_to_export=['book_title', 'stock', 'price'],
+        file=open('c:/tmp/book_data.json', 'a+b'),
+        encoding='utf_8_sig')]

-# 3) Setup the WorkGroups. You can create an arbitrary number of WorkGroups in a list.
+# 4) Setup the WorkGroups. You can create an arbitrary number of WorkGroups in a list.
 # For example, if there are three different domains which you want to search for
-# the book titles from the excel file. To, scrape the price and stock data on
-# each of the three different websites for each book title. You could setup three
+# the book titles from the excel file. If you wanted to scrape the price and stock data
+# on each of the three different websites for each book title. You could setup three
 # different WorkGroups here. Last, the WorkGroup.name should match the tracker name.
+
 groups = [
     WorkGroup(
         name='books.toscrape.com',
+        url='http://books.toscrape.com/',
         spider=BooksToScrapeScraper,
         worker=BooksWorker,
         items=BookItems,
         loader=BookItemsLoader,
-        exporters=[
-            CsvItemExporter(
-                fields_to_export=['book_title', 'stock', 'price'],
-                file=open('c:/tmp/book_data.csv', 'a+b'))
-        ],
+        exporters=exporters,
         workers=20, # this creates 20 scrapers and assigns each a book as a task
-        kwargs={'url': 'http://books.toscrape.com/', 'timeout': (3.0, 20.0)})
+        kwargs={'timeout': (3.0, 20.0)})
 ]

-# 4) Last, setup the Manager. You can constrain the number of workers actually
+# 5) Last, setup the Manager. You can constrain the number of workers actually
 # deployed, through the `pool` parameter. For example, this is useful
 # when using a Crawlera 'C10' plan which limits concurrency to 10. To deploy all
 # the workers concurrently, set the pool to be marginally larger than the number
 # of total workers assigned in groups in step #3 above.
-manager = BooksWorkGroupManager('books_scrape', stateful_book, groups=groups, pool=25)
+manager = BooksWorkGroupManager('books_scrape', tasks, groups=groups, pool=25)


 if __name__ == "__main__":
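The new step 3 comment notes that a single exporters list can be handed to any number of WorkGroup objects, and the step 4 comment describes searching the same titles across several domains. A sketch of that multi-site setup, where the second domain, its tracker name, and OtherSiteScraper are hypothetical placeholders rather than anything in this repository:

    trackers = ['books.toscrape.com', 'books.example.com']  # one tracker per WorkGroup
    tasks = StatefulBook(file, trackers, keywords='titles', autorun=True)

    groups = [
        WorkGroup(
            name='books.toscrape.com',
            url='http://books.toscrape.com/',
            spider=BooksToScrapeScraper,
            worker=BooksWorker,
            items=BookItems,
            loader=BookItemsLoader,
            exporters=exporters,           # the shared exporters list
            workers=20,
            kwargs={'timeout': (3.0, 20.0)}),
        WorkGroup(
            name='books.example.com',      # must match the second tracker name
            url='http://books.example.com/',
            spider=OtherSiteScraper,       # hypothetical spider for the second site
            worker=BooksWorker,
            items=BookItems,
            loader=BookItemsLoader,
            exporters=exporters,           # reused as-is
            workers=20,
            kwargs={'timeout': (3.0, 20.0)}),
    ]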

setup.py

Lines changed: 1 addition & 1 deletion

@@ -123,7 +123,7 @@ def run(self):
     author_email=EMAIL,
     python_requires=REQUIRES_PYTHON,
     url=URL,
-    download_url='https://github.com/bomquote/transistor/archive/v0.2.0.tar.gz',
+    download_url='https://github.com/bomquote/transistor/archive/v0.2.1.tar.gz',
     keywords=['scraping', 'crawling', 'spiders', 'requests', 'beautifulsoup4',
               'mechanicalsoup', 'framework', 'headless-browser'],
     packages=find_packages(exclude=('tests',)),

tests/books_toscrape/test_books_toscrape.py

Lines changed: 2 additions & 1 deletion

@@ -99,13 +99,14 @@ def bts_manager(_BooksToScrapeGroup, _BooksWorker):
     groups = [
         WorkGroup(
             name='books.toscrape.com',
+            url='http://books.toscrape.com/',
             spider=BooksToScrapeScraper,
             worker=_BooksWorker,
             items=BookItems,
             loader=BookItemsLoader,
             exporters=exporters,
             workers=3, # this creates 3 scrapers and assigns each a book as a task
-            kwargs={'url': 'http://books.toscrape.com/', 'timeout': (3.0, 20.0)})
+            kwargs={'timeout': (3.0, 20.0)})
     ]
     manager = BooksWorkGroupManager('books_scrape', tasks, groups=groups, pool=5)


transistor/__version__.py

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@
 __title__ = 'transistor'
 __description__ = 'A web scraping framework for intelligent use cases.'
 __url__ = 'https://github.com/bomquote/transistor'
-__version__ = '0.2.0'
+__version__ = '0.2.1'
 __author__ = 'Bob Jordan'
 __author_email__ = 'bmjjr@bomquote.com'
 __license__ = 'MIT'

transistor/managers/base_manager.py

Lines changed: 1 addition & 0 deletions

@@ -99,6 +99,7 @@ def _init_workers(self):
             # add the name to group.kwargs dict so it can be passed down
             # to the group/worker/spider and assigned as an attr
             group.kwargs['name'] = name
+            group.kwargs['url'] = group.url
             group.kwargs['spider'] = group.spider
             group.kwargs['worker'] = group.worker
             group.kwargs['items'] = group.items
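The practical effect is that the manager folds the WorkGroup's url back into the kwargs dict before the workers and spiders are built, so a custom spider that previously read url from kwargs keeps working without changes. A simplified sketch of that hand-off; propagate_url is a made-up helper that only mirrors the excerpt above:

    def propagate_url(group):
        # copy the new WorkGroup.url attribute into the kwargs that are
        # passed down to the group, worker, and spider
        group.kwargs['url'] = group.url
        return group.kwargs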
