# Spider(bs4).py -- from the redxmiton/Python-MiniScripts repository on GitHub (master branch).
from urllib.parse import urldefrag, urljoin
from urllib.request import urlopen

from bs4 import BeautifulSoup


def _page_text(soup):
    """Return the text of the <head> and <body> of *soup*, joined by a newline.

    A page that lacks either tag contributes an empty string for it instead
    of raising, so partially-formed documents can still be searched.
    """
    sections = []
    for tag_name in ('head', 'body'):
        tag = soup.find(tag_name)
        sections.append(tag.get_text() if tag is not None else '')
    return '\n'.join(sections)


def _site_links(soup, page_url, root):
    """Yield the absolute link targets found in *soup* that start with *root*.

    Each href is resolved against *page_url* so relative links are followed,
    and its ``#fragment`` is dropped so one page is not queued several times.
    """
    for anchor in soup.find_all('a', href=True):
        target = urldefrag(urljoin(page_url, anchor['href'])).url
        if target.startswith(root):
            yield target


def main():
    """Interactively crawl a website and report the pages containing a word.

    Prompts for a start URL, the text to search for and the maximum number of
    pages to visit. Pages are visited in discovery order (breadth-first),
    following only links whose absolute URL starts with the start URL. The
    URLs of the pages whose head/body text contains the search text are
    printed at the end.

    Raises:
        ValueError: if the page limit entered is not an integer.
    """
    url = input('Enter the URL of the website to search: ')
    word = input('Enter the word(s) to search: ')
    max_pages = int(input('Enter the max no of pages to search: '))

    root = url
    links = [root]      # visit queue, in discovery order
    seen = {root}       # same URLs as `links`, for O(1) duplicate checks
    found_word = []

    # Run until the max number of pages are visited or no links are left.
    visited = 0
    while visited < max_pages and visited < len(links):
        url = links[visited]
        visited += 1
        print(visited, 'Visiting: ', url)
        try:
            # Close the HTTP response as soon as the document is parsed.
            with urlopen(url) as response:
                soup = BeautifulSoup(response, 'html.parser')

            if word in _page_text(soup):
                found_word.append(url)

            # Queue every not-yet-seen link that stays on the site.
            for target in _site_links(soup, url, root):
                if target not in seen:
                    seen.add(target)
                    links.append(target)
        except Exception as err:
            # Best-effort crawl: one broken page must not stop the run, but
            # KeyboardInterrupt/SystemExit are deliberately not caught here.
            print('**Failed!**', err)
        else:
            print('**Success!**')

    if found_word:
        print('\nThe word', word, 'was found at:')
        print(*found_word, sep='\n')
    else:
        print('\nWord never found')


# Start the interactive crawler only when this file is run as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()