crawler.py
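"""Simple pyppeteer-based crawler: starting from a seed URL, it visits pages
under www.concordia.ca, extracts their visible text with BeautifulSoup, and
writes each page's text to pages/<id>.txt, up to `limit` pages."""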
import pyppeteer as pt
import asyncio
from pyppeteer.page import Page
from urllib.parse import urlparse
from bs4 import BeautifulSoup

Url = "https://www.concordia.ca/ginacody.html"


class Crawler:
    def __init__(self, startUrl: str, limit: int):
        self.visited = []
        self.limit = limit
        self.id = 0
        self.startUrl = startUrl
        # Tags whose text content is ignored during extraction.
        self.blacklist = [
            '[document]',
            'noscript',
            'header',
            'html',
            'meta',
            'head',
            'input',
            'script',
            'style',
            'a',
        ]
    async def crawl(self, page: Page, link: str):
        try:
            # Skip javascript: pseudo-links.
            if link.startswith('javascript'):
                return
            urlParse = urlparse(link)
            # Stay inside the Concordia domain.
            if urlParse.netloc != "www.concordia.ca":
                return
            # Normalise the URL: drop query strings and fragments.
            parsedLink = urlParse.scheme + "://" + urlParse.netloc + urlParse.path
            self.visited.append(parsedLink)
            response = await page.goto(parsedLink)
            # Only process HTML responses (the header may carry a charset suffix).
            if not response.headers.get('content-type', '').startswith('text/html'):
                return
            print("current page: " + parsedLink + ", id: " + str(self.id))
            await page.waitFor(200)  # give dynamic content a moment to render
            await self.extractText(await page.content())
            self.id += 1
            self.limit -= 1
            if self.limit == 0:
                return
            # Gather the href of every anchor on the page, then recurse.
            anchors = await page.querySelectorAll('a')
            newLinks: list[str] = []
            for anchor in anchors:
                newLink: str = await page.evaluate('(element) => element.href', anchor)
                newLinks.append(newLink)
            for newLink in newLinks:
                newUrlParse = urlparse(newLink)
                newParsedLink = newUrlParse.scheme + "://" + newUrlParse.netloc + newUrlParse.path
                if newParsedLink in self.visited:
                    continue
                await self.crawl(page, newParsedLink)
                if self.limit == 0:
                    return
        except Exception as e:
            print("Error has occurred: " + str(e))
            self.limit = 0
    async def extractText(self, html: str):
        bs4 = BeautifulSoup(html, 'html.parser')
        text = bs4.find_all(text=True)
        output = ''
        for t in text:
            # Keep only visible text, collapsing runs of whitespace.
            if t.parent.name not in self.blacklist:
                content = str(t)
                output += ' '.join(content.split()) + ' '
        try:
            # Write the extracted text to pages/<id>.txt.
            with open('pages/' + str(self.id) + '.txt', 'w') as file:
                file.write(output)
        except OSError:
            print('cannot open file ' + str(self.id) + '.txt')
    async def startCrawling(self):
        browser = await pt.launch()
        page = await browser.newPage()
        await self.crawl(page, self.startUrl)
        await browser.close()


crawler = Crawler(Url, 90)
asyncio.get_event_loop().run_until_complete(crawler.startCrawling())