crawler.py
import logging
from urllib.parse import urljoin

from bs4 import BeautifulSoup

import repo
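
# repo is assumed to expose a small queue/visited-set API, used below:
# add_to_queue, add_to_visit, move_from_queued_to_visited, is_visited, is_queued.
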

async def crawl(url, get_html, extract_content):
    # TODO: Error handling
    if not url:
        logging.info('URL not provided')
        return 'URL not provided'
    repo.add_to_queue(url)
    content = await _crawl(url, get_html, extract_content)
    repo.move_from_queued_to_visited(url)
    logging.info('Content extracted in main crawl: %s', content)
    return content


def add_results_to_queue(urls, allow_url_filter):
    # TODO: Error handling
    if not urls:
        return
    for url in urls:
        # Only queue URLs that pass the caller-supplied filter.
        if allow_url_filter(url):
            logging.info('Add URL to visit queue: %s', url)
            repo.add_to_visit(url)


async def _crawl(url, get_html, extract_content):
    # TODO: Error handling
    logging.info('Crawl -> %s', url)
    html = await get_html(url)
    logging.info('Html returned.')
    soup = BeautifulSoup(html, 'html.parser')
    logging.info('Soup returned.')
    # links = _extract_links(url, soup)
    content = extract_content(url, soup)
    logging.info('Content extracted in _crawl: %s', content)
    return content


def _extract_links(url, soup):
    # TODO: Error handling
    # Currently unused: the call site in _crawl() is commented out.
    # Resolves each href against the page URL and skips rel="nofollow" links;
    # the set comprehension deduplicates before converting to a list.
    return list({
        urljoin(url, a.get('href'))
        for a in soup.find_all('a')
        if a.get('href') and not (a.get('rel') and 'nofollow' in a.get('rel'))
    })


def _seen(url):
    # TODO: Error handling
    # A URL counts as seen if it was already visited or is currently queued.
    return repo.is_visited(url) or repo.is_queued(url)
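

# --- Usage sketch (not part of the original module) ---
# A minimal example of how crawl() could be driven. fetch_html() is an
# assumption: any coroutine that returns HTML for a URL will do, and aiohttp
# is one common choice. extract_page_title() is a hypothetical extractor;
# callers supply their own. Running this also assumes a working repo backend.
if __name__ == '__main__':
    import asyncio

    import aiohttp

    async def fetch_html(url):
        # Fetch the page body as text over a short-lived session.
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                return await response.text()

    def extract_page_title(url, soup):
        # Example extractor: return the page title, if any.
        return soup.title.string if soup.title else None

    asyncio.run(crawl('https://example.com', fetch_html, extract_page_title))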