Commit 977e6d8

fixed BaseWorker.load_items(), added broker test for rabbit.
1 parent 936cbbb commit 977e6d8

23 files changed: +384 additions, -118 deletions

CHANGES

Lines changed: 7 additions & 0 deletions
@@ -2,6 +2,13 @@
 transistor: CHANGES
 ========================

+12/3/18
+- Fixed a bug in the BaseWorker.load_items() method which previously resulted
+  in losing scrape data when the number of workers did not equal the number
+  of tasks. Now, any number of workers or any pool size will produce
+  consistent export/save results, while scrape time will change in proportion
+  to the number of workers assigned. Wrote tests to ensure the same.
+
 11/30/18
 - BaseGroup.hired_worker() method name has been changed to `get_worker()`
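
As context for the note above, here is a minimal sketch of the kind of regression check it implies. The actual tests added in this commit are not shown on this page; the helper names (ndb, get_job_results, delete_job) are simply reused from the books_to_scrape example files changed below.

# Hypothetical regression sketch only -- the tests added in this commit are not
# shown on this page. The helpers below (ndb, get_job_results, delete_job) are
# used the same way in examples/books_to_scrape/schedulers/brokers/worker_main.py.
from examples.books_to_scrape.persistence import ndb


def check_exports_with_unequal_workers_and_tasks(manager, get_job_results,
                                                 delete_job, expected_tasks=3):
    """Run a job configured with fewer workers than tasks (e.g. 2 workers and
    3 keyword tasks); per the 12/3/18 fix, every task should still be saved."""
    manager.main()  # run the scrape job to completion
    results = get_job_results(ndb, 'books_scrape')
    assert len(results) == expected_tasks  # one saved item per task, not per worker
    delete_job(ndb, 'books_scrape')  # clean up the persisted test data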

README.rst

Lines changed: 2 additions & 1 deletion
@@ -54,7 +54,7 @@ Development of Transistor is sponsored by `BOM Quote Manufacturing <https://www.
 4. Easily integrate within a web app like `Flask <https://github.com/pallets/flask>`_, `Django <https://github.com/django/django>`_ , or other python based `web frameworks <https://github.com/vinta/awesome-python#web-frameworks>`_.
 5. Provide spreadsheet based data ingest and export options, like import a list of search terms from excel, ods, csv, and export data to each as well.
 6. Utilize quick and easy integrated task work queues which can be automatically filled with data search terms by a simple spreadsheet import.
-7. Able to integrate with more robust task queues like `Celery <https://github.com/celery/celery>`_ while using `rabbitmq <https://www.rabbitmq.com/>`_ or `redis <https://redis.io/>`_ as needed.
+7. Able to integrate with more robust task queues like `Celery <https://github.com/celery/celery>`_ while using `rabbitmq <https://www.rabbitmq.com/>`_ or `redis <https://redis.io/>`_ as a message broker, as desired.
 8. Provide hooks for users to persist data via any method they choose, while also supporting our own opinionated choice which is a `PostgreSQL <https://www.postgresql.org/>`_ database along with `newt.db <https://github.com/newtdb/db>`_.
 9. Contain useful abstractions, classes, and interfaces for scraping and crawling with machine learning assistance (wip, timeline tbd).
 10. Further support data science use cases of the persisted data, where convenient and useful for us to provide in this library (wip, timeline tbd).

@@ -485,6 +485,7 @@ Transistor provides useful layers and objects in the following categories:

 - see ``transistor/schedulers/brokers``
 - provides the ``ExchangeQueue`` class in transistor.schedulers.brokers.queues which can be passed to the ``tasks`` parameter of ``BaseWorkGroupManager``
+- Just pass the appropriate connection string to ``ExchangeQueue`` and ``BaseWorkGroupManager`` and you can use either RabbitMQ or Redis as a message broker, thanks to `kombu <https://github.com/celery/kombu>`_.
 - in this case, the ``BaseWorkGroupManager`` also acts as an AMQP ``consumer`` which can receive messages from the RabbitMQ message broker
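
To illustrate that added point, here is a minimal sketch of swapping brokers by connection string. The ExchangeQueue, Connection, and BaseWorkGroupManager calls mirror the example modules changed in this commit; the Redis URL and the empty groups list are placeholders, not values taken from the repository.

from kombu import Connection
from transistor import BaseWorkGroupManager
from transistor.schedulers.brokers.queues import ExchangeQueue

trackers = ['books.toscrape.com']  # one tracker name per WorkGroup
tasks = ExchangeQueue(trackers)

# RabbitMQ as the message broker ...
connection = Connection("pyamqp://guest:guest@localhost:5672//")
# ... or, since kombu abstracts the transport, Redis instead (placeholder URL):
# connection = Connection("redis://localhost:6379/0")

groups = []  # placeholder: fill with WorkGroup definitions, as in worker_main.py
manager = BaseWorkGroupManager('books_scrape', tasks, workgroups=groups,
                               pool=10, connection=connection)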

appveyor.yml

Lines changed: 1 addition & 0 deletions
@@ -120,6 +120,7 @@ install:
 - cmd: pip install pytest==4.0.1
 - cmd: pip install pytest-cov==2.6.0
 - cmd: pip install coverage==4.5.2
+- cmd: pip install cssselect==1.0.3
 - cmd: pip install mock==2.0.0
 - cmd: pip install gevent==1.3.7
 - cmd: pip install newt.db==0.9.0

appveyor/rabbitmq.ps1

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 # """
 # transistor.appveyor.rabbitmq
 # ~~~~~~~~~~~~
-# This is a powershell script to install rabbitmq in a windows server environment.
+# This is a powershell script to install rabbitmq on Windows.
 # It is included here to facilitate various test cases in appveyor CI.
 #
 # :copyright: Copyright (C) 2018 by BOM Quote Limited

appveyor/redis.ps1

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 # """
 # transistor.appveyor.redis
 # ~~~~~~~~~~~~
-# This is a powershell script to install redis in a windows server environment.
+# This is a powershell script to install Redis on Windows.
 # It is included here to facilitate various test cases in appveyor CI.
 #
 # :copyright: Copyright (C) 2018 by BOM Quote Limited

examples/books_to_scrape/manager.py

Lines changed: 3 additions & 3 deletions
@@ -12,7 +12,7 @@
 """
 import gevent
 from transistor import BaseWorkGroupManager
-
+from transistor.utility.logging import logger

 class BooksWorkGroupManager(BaseWorkGroupManager):
     """

@@ -36,7 +36,7 @@ def monitor(self, target):
         :param target: the target parameter here is a <Worker()> class object and
         you must call target.spawn_spider() to start the Worker.
         """
-        print(f'spawning {target}')
+        logger.info(f'spawning {target}')
         target.spawn_spider()  # this must be called. It is required.
         # Calling spawn_spider() above instructs the Worker object to start
         # the scrape. So there will be some wait period at this point for each

@@ -49,7 +49,7 @@ def monitor(self, target):
         # here, event represents returned scraper objects which the worker has
         # completed. We can iterate through the event objects and, for example,
         # apply some data transformation, delete failed scrapes, or save data
-        print(f'THIS IS A MONITOR EVENT - > {event}')
+        logger.info(f'THIS IS A MONITOR EVENT - > {event}')
         # This last line is required, ensure the below gevent.sleep(0) remains.
         gevent.sleep(0)
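
As those comments note, the monitor() loop is a natural place to drop failed scrapes before doing anything else with them. A rough sketch of such an override is below; the iteration over target.events and the use of a missing price as the failure signal are assumptions made for illustration, not behavior confirmed by this commit.

import gevent
from transistor import BaseWorkGroupManager
from transistor.utility.logging import logger


class FilteringBooksManager(BaseWorkGroupManager):
    """Hypothetical variant of BooksWorkGroupManager that skips failed scrapes."""

    def monitor(self, target):
        target.spawn_spider()  # required: instructs the Worker to start the scrape
        for event in target.events:  # assumes completed spiders land in .events
            if getattr(event, 'price', None) is None:
                # treat a missing price as a failed scrape (an example heuristic
                # for books_to_scrape, not a library convention) and skip it
                logger.info(f'dropping failed scrape -> {event}')
                continue
            logger.info(f'keeping scrape -> {event}')
        gevent.sleep(0)  # required: yield control back to the gevent hub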

examples/books_to_scrape/schedulers/brokers/client_main.py

Lines changed: 71 additions & 9 deletions
@@ -5,30 +5,92 @@
 This module implements a client producer for testing
 and example.

+To run this example, first run:
+
+>>> python client_worker.py
+
+This will start the worker and await the task. Then, in a separate
+command prompt, to simulate a message sent to the broker queue, run:
+
+>>> python client_main.py
+
+The result should be that the worker will process the `keywords` tasks.
+
 :copyright: Copyright (C) 2018 by BOM Quote Limited
 :license: The MIT License, see LICENSE for more details.
 ~~~~~~~~~~~~
 """
-
+import time
+from kombu import Connection
 from kombu.pools import producers
-from examples.books_to_scrape.schedulers.brokers.worker_main import tasks
+from transistor.schedulers.brokers.queues import ExchangeQueue
+from transistor.utility.logging import logger
+# from examples.books_to_scrape.schedulers.brokers.worker_main import tasks

-task_exchange = tasks.task_exchange
+trackers = ['books.toscrape.com']
+tasks = ExchangeQueue(trackers)
+connection = Connection("pyamqp://guest:guest@localhost:5672//")

+def _publish(producer, payload, routing_key, exchange):
+    """
+    :param producer: example ->
+    >>> with producers[connection].acquire(block=True) as producer:
+    :param payload: example ->
+    >>> payload = {'keywords': keywords, 'kwargs': kwargs}
+    :param routing_key: Type[str]: 'books.toscrape.com'
+    :param exchange: a kombu Type[Exchange] class object
+    :return:
+    """
+    producer.publish(payload,
+                     serializer='json',
+                     exchange=exchange,
+                     routing_key=routing_key,
+                     declare=[exchange],
+                     retry=True,
+                     retry_policy={
+                         'interval_start': 0,  # First retry immediately,
+                         'interval_step': 2,  # then increase by 2s for every retry.
+                         'interval_max': 5,  # don't exceed 5s between retries.
+                         'max_retries': 3,  # give up after 3 tries.
+                     })
+
+
-def send_as_task(connection, keywords, kwargs={}):
+def send_as_task(connection, keywords, routing_key, exchange, kwargs={}):
     payload = {'keywords': keywords, 'kwargs': kwargs}

     with producers[connection].acquire(block=True) as producer:
+        # for tracker in tasks.trackers:
+        #     _publish(producer=producer, payload=payload, routing_key=tracker, exchange=exchange)
         producer.publish(payload,
                          serializer='json',
-                         exchange=task_exchange,
-                         declare=[task_exchange])
+                         # if there is more than one tracker, use something like
+                         # the _publish above, with a for loop for each tracker
+                         routing_key=routing_key,
+                         exchange=exchange,
+                         declare=[exchange],
+                         )


 if __name__ == '__main__':

     from kombu import Connection
-    keywords = '["Soumission", "Rip it Up and Start Again", "Black Dust"]'
-    connection = Connection("pyamqp://guest:guest@localhost:5672//")
-    send_as_task(connection, keywords=keywords, kwargs={})
+    from kombu.utils.debug import setup_logging
+
+    # setup root logger
+    setup_logging(loglevel='INFO', loggers=[''])
+
+    keyword_1 = '["Soumission"]'
+    keyword_2 = '["Rip it Up and Start Again"]'
+    keywords = '["Black Dust", "When We Collided"]'
+
+    with Connection("pyamqp://guest:guest@localhost:5672//") as conn:
+        send_as_task(conn, keywords=keyword_1, routing_key='books.toscrape.com',
+                     exchange=tasks.task_exchange, kwargs={})
+        logger.info(f'sent task {keyword_1}')
+        send_as_task(conn, keywords=keyword_2, routing_key='books.toscrape.com',
+                     exchange=tasks.task_exchange, kwargs={})
+        logger.info(f'sent task {keyword_2}')
+        send_as_task(conn, keywords=keywords, routing_key='books.toscrape.com',
+                     exchange=tasks.task_exchange, kwargs={})
+        logger.info(f'sent task {keywords}')
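
The commented-out loop above hints at fanning tasks out to more than one tracker. Here is a short sketch of what that could look like, reusing the _publish helper defined in this file; the second tracker name is invented for illustration, and iterating ExchangeQueue.trackers is assumed from the comment rather than a documented API.

from kombu.pools import producers
from transistor.schedulers.brokers.queues import ExchangeQueue

# 'quotes.toscrape.com' is a hypothetical second tracker, for illustration only
multi_tasks = ExchangeQueue(['books.toscrape.com', 'quotes.toscrape.com'])


def send_to_all_trackers(connection, keywords, kwargs={}):
    payload = {'keywords': keywords, 'kwargs': kwargs}
    with producers[connection].acquire(block=True) as producer:
        # one publish per tracker, so each WorkGroup queue receives the task
        for tracker in multi_tasks.trackers:
            _publish(producer=producer, payload=payload,
                     routing_key=tracker, exchange=multi_tasks.task_exchange)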

examples/books_to_scrape/schedulers/brokers/worker_main.py

Lines changed: 28 additions & 13 deletions
@@ -4,6 +4,17 @@
 ~~~~~~~~~~~~
 Entry point to run the books_to_scrape example.

+To run this example, first run:
+
+>>> python client_main.py
+
+This will start the producer and send the tasks to the broker Exchange queue.
+Then, in a separate command prompt, run:
+
+>>> python client_worker.py
+
+The result should be that the worker will process the `keywords` tasks.
+
 Note:

 The primary use case where the current Transistor design shines, is when you need to

@@ -15,9 +26,10 @@
 of workers to the search page. Each worker has one task issued, a task to execute the
 search for the term it has been assigned, and return to us with the response.

-The example highlighted here, employs a "crawl" mechanism inside each
-of the scraper objects. This is not really showcasing the optimal use case for
-a Transistor SplashScraper with Manager/WorkGroups, per the current design.
+The example highlighted here in `books_to_scrape` employs a "crawl" mechanism
+inside each of the scraper objects. This is not really showcasing the optimal
+use case for a Transistor SplashScraper with Manager/WorkGroups, per the
+current design.

 The reason is, in this example, we send out 20 workers at once. Each worker
 crawls through EACH PAGE on the books.toscrape.com website, until the worker finds

@@ -58,10 +70,11 @@
 from examples.books_to_scrape.manager import BooksWorkGroupManager
 from examples.books_to_scrape.persistence.serialization import (
     BookItems, BookItemsLoader)
+from transistor.utility.logging import logger


-# 1) Create a FanoutTask instance and connection object to prepare to use
-# RabbitMQ message broker.
+# 1) Create an ExchangeQueue instance and connection object to prepare
+# to use the RabbitMQ message broker.
 # Set a list of tracker names, with one tracker name for each WorkGroup you create
 # in step three. Ensure the tracker name matches the WorkGroup.name in step four.

@@ -98,17 +111,17 @@
         items=BookItems,
         loader=BookItemsLoader,
         exporters=exporters,
-        workers=3,  # this creates 3 scrapers and assigns each a book as a task
-        kwargs={'timeout': (3.0, 20.0), 'qtimeout': 10})
+        workers=2,  # this creates x scrapers and assigns each a book as a task
+        kwargs={'timeout': (3.0, 20.0)})
 ]

 # 4) Last, setup the Manager. You can constrain the number of workers actually
 # deployed, through the `pool` parameter. For example, this is useful
 # when using a Crawlera 'C10' plan which limits concurrency to 10. To deploy all
 # the workers concurrently, set the pool to be marginally larger than the number
 # of total workers assigned in groups in step #3 above.
-manager = BooksWorkGroupManager('books_scrape', tasks, workgroups=groups, pool=5,
-                                connection=connection, should_stop=True)
+manager = BooksWorkGroupManager('books_scrape', tasks, workgroups=groups, pool=10,
+                                connection=connection)

 if __name__ == "__main__":

@@ -117,13 +130,15 @@
     setup_logging(loglevel='INFO', loggers=[''])
     with Connection('amqp://guest:guest@localhost:5672//') as conn:
         try:
+
             manager.main()  # call manager.main() to start the job.
         except KeyboardInterrupt:
            print('bye bye')
     # below shows an example of navigating your persisted data after the scrape

     result = get_job_results(ndb, 'books_scrape')
-    print(f'Printing: books_scrape result')
-    for r in result:
-        print(f"{r['book_title']}, {r['price']}, {r['stock']}")
-    delete_job(ndb, 'books_scrape')
+    logger.info(f'Printing: books_scrape result =>')
+    if result:
+        for r in result:
+            logger.info(f"{r['book_title']}, {r['price']}, {r['stock']}")
+        delete_job(ndb, 'books_scrape')

examples/books_to_scrape/schedulers/stateful_book/main.py

Lines changed: 2 additions & 2 deletions
@@ -116,8 +116,8 @@ def get_file_path(filename):
 # 5) Last, setup the Manager. You can constrain the number of workers actually
 # deployed, through the `pool` parameter. For example, this is useful
 # when using a Crawlera 'C10' plan which limits concurrency to 10. To deploy all
-# the workers concurrently, set the pool to be marginally larger than the number
-# of total workers assigned in groups in step #3 above.
+# the workers concurrently, set the pool +1 higher than the number of total
+# workers assigned in groups in step #3 above. The +1 is for the pool manager.
 manager = BooksWorkGroupManager('books_scrape', tasks, workgroups=groups, pool=5)
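
A quick illustration of that sizing rule; the numbers are only an example, and `tasks` and `groups` are the objects already defined earlier in this script.

# If step #3 defines, say, 20 workers in total across all WorkGroups, then
# deploying them all concurrently calls for pool = 20 + 1, with the extra
# slot reserved for the pool manager itself (per the comment above).
total_workers = 20
manager = BooksWorkGroupManager('books_scrape', tasks, workgroups=groups,
                                pool=total_workers + 1)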

examples/books_to_scrape/workgroup.py

Lines changed: 8 additions & 8 deletions
@@ -12,7 +12,7 @@
 from transistor import BaseWorker
 from examples.books_to_scrape.persistence import ndb
 from transistor.persistence.newt_db.collections import SpiderList
-
+from transistor.utility.logging import logger

 class BooksWorker(BaseWorker):
     """

@@ -44,8 +44,8 @@ def pre_process_exports(self, spider, task):
         try:
             # create the list with the job name if it doesn't already exist
             ndb.root.spiders.add(self.job_id, SpiderList())
-            print(f'Worker {self.name}-{self.number} created a new scrape_list for '
-                  f'{self.job_id}')
+            logger.info(f'Worker {self.name}-{self.number} created a new spider '
+                        f'list for {self.job_id}')
         except KeyError:
             # will be raised if there is already a list with the same job_name
             pass

@@ -54,11 +54,11 @@ def pre_process_exports(self, spider, task):
             # save the items object to newt.db
             ndb.root.spiders[self.job_id].add(items)
             ndb.commit()
-            print(f'Worker {self.name}-{self.number} saved {items.__repr__()} to '
+            logger.info(f'Worker {self.name}-{self.number} saved {items.__repr__()} to '
                         f'scrape_list "{self.job_id}" for task {task}.')
         else:
             # if job_id is NONE then we'll skip saving the objects
-            print(f'Worker {self.name}-{self.number} said job_name is {self.job_id} '
+            logger.info(f'Worker {self.name}-{self.number} said job_name is {self.job_id} '
                         f'so will not save it.')

     def post_process_exports(self, spider, task):

@@ -70,6 +70,6 @@ class attribute called `events`.

         """
         self.events.append(spider)
-        print(f'{self.name} has {spider.stock} inventory status.')
-        print(f'pricing: {spider.price}')
-        print(f'Worker {self.name}-{self.number} finished task {task}')
+        logger.info(f'{self.name} has {spider.stock} inventory status.')
+        logger.info(f'pricing: {spider.price}')
+        logger.info(f'Worker {self.name}-{self.number} finished task {task}')
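
For reference, a small sketch of inspecting what pre_process_exports() persisted; the get_job_results(ndb, 'books_scrape') helper used in worker_main.py is the route the example actually takes, and iterating the SpiderList directly is an assumption made here for illustration.

from examples.books_to_scrape.persistence import ndb

# pre_process_exports() above stores each exported items object in
# ndb.root.spiders[job_id], so the saved results for the 'books_scrape' job
# live in that collection.
saved = ndb.root.spiders['books_scrape']
for items in saved:  # assumes SpiderList supports iteration (not confirmed here)
    print(items.__repr__())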
