forked from DishantK1807/Python-MiniScripts
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSpider(bs4).py
More file actions
48 lines (40 loc) · 1.68 KB
/
Spider(bs4).py
File metadata and controls
48 lines (40 loc) · 1.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from urllib.parse import urldefrag, urljoin
from urllib.request import urlopen

from bs4 import BeautifulSoup
def _page_text(soup):
    """Return the text of the page's <head> and <body>, newline-separated.

    ``soup.find`` returns ``None`` when an element is absent; a missing
    section is skipped instead of raising, so the rest of the page is still
    searched.
    """
    sections = (soup.find('head'), soup.find('body'))
    return '\n'.join(
        section.get_text() for section in sections if section is not None
    )


def _site_links(soup, page_url, root_url):
    """Yield the absolute URLs linked from the page that start with root_url.

    Each ``href`` is resolved against ``page_url`` so that relative links are
    followed too, and its ``#fragment`` is dropped so that anchors into the
    same document are not queued as separate pages.
    """
    for anchor in soup.find_all('a', href=True):
        try:
            link = urldefrag(urljoin(page_url, anchor['href'])).url
        except ValueError:
            # URL parsing rejects some malformed hrefs (e.g. unmatched
            # brackets in the host); skip that link, not the whole page.
            continue
        if link.startswith(root_url):
            yield link


def main():
    """Crawl a website breadth-first and report the pages containing a word.

    Prompts for a start URL, the text to look for and the maximum number of
    pages to visit. Beginning with the start URL, each page is fetched, its
    <head>/<body> text is searched for the text, and every link on it whose
    resolved URL starts with the start URL is queued for a later visit.
    Progress is printed for each page and the matching URLs are listed at
    the end. Pages that fail to load still count towards the page limit.

    Raises:
        ValueError: if the page limit entered is not an integer.
    """
    root_url = input('Enter the URL of the website to search: ')
    word = input('Enter the word(s) to search: ')
    max_pages = int(input('Enter the max no of pages to search: '))

    links = [root_url]   # URLs in visiting order; doubles as the FIFO queue
    queued = {root_url}  # the same URLs as a set, for O(1) duplicate checks
    found_word = []
    number_visited = 0

    # Run until the page limit is reached or there are no more links to visit.
    while number_visited < max_pages and number_visited < len(links):
        url = links[number_visited]
        number_visited += 1
        print(number_visited, 'Visiting: ', url)
        try:
            # The context manager closes the connection once parsing is done.
            with urlopen(url) as response:
                soup = BeautifulSoup(response, 'html.parser')
                # After a redirect, the final URL is the correct base for
                # resolving the page's relative links.
                page_url = response.url
        except Exception as error:
            # Best-effort crawl: one unreachable or unparsable page must not
            # abort the run. Unlike a bare ``except``, this still lets
            # Ctrl-C (KeyboardInterrupt) stop the program.
            print('**Failed!**', error)
            continue

        # Every queued URL is unique, so a page can be recorded only once.
        if word in _page_text(soup):
            found_word.append(url)

        for link in _site_links(soup, page_url, root_url):
            if link not in queued:
                queued.add(link)
                links.append(link)
        print('**Success!**')

    if found_word:
        print('\nThe word', word, 'was found at:')
        print(*found_word, sep='\n')
    else:
        print('\nWord never found')
# Start the crawler only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()