- High-speed web crawler built on EventMachine.
- Supports database engines such as PostgreSQL, MySQL, Oracle, and SQLite.
- Command line tools.
- Extract data using XPath.
- Cookie Handlers.
""" models.rb """
require 'rubygems'
require 'data_mapper'
require 'dm-migrations'
# DataMapper resource holding the fields stored for one package entry.
#
# All three fields are declared as String properties.
#
# NOTE(review): no key property (e.g. a Serial id) is declared in this
# snippet; DataMapper models normally need one -- confirm whether the
# crawler framework adds it before relying on this model as-is.
class Package
  include DataMapper::Resource

  # Value for the "updated" field of a package entry.
  property :updated,     String
  # Value for the "package" field (presumably the package name -- confirm).
  property :package,     String
  # Value for the "description" field of a package entry.
  property :description, String
end
""" crawlers.rb """
require 'crawlers'
require 'scrapers'
# Scraper for PyPI pages, built on BaseScraper.
class PypiScraper < BaseScraper
  # URL patterns associated with this scraper.
  # NOTE(review): the surrounding "%" characters look like wildcard markers;
  # confirm the matching semantics against BaseScraper.
  @@matching_urls = ["%pypi.python.org/pypi%"]

  # Handles a fetched response by passing it, unchanged, to the parent
  # implementation. Kept as an explicit hook for custom scraping logic.
  def scrape(response)
    super(response)
  end
end
# Crawler configuration for PyPI, built on BaseCrawler.
class PypiCrawler < BaseCrawler
  # URLs the crawl starts from; add more starting URLs to this list.
  @@start_urls = ["http://pypi.python.org/pypi"]

  # Scraper instances used by this crawler; add further scraper objects here.
  @@scrapers = [PypiScraper.new]

  # Maximum crawling depth level.
  @@max_depth = 1
end