bfs.py
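"""Breadth-first search over Russian Wikipedia article links.

Starting from a source article, the script follows internal /wiki/ links level
by level until it reaches the destination article, then prints, for every hop
on the found path, the paragraph of the current page that contains the link to
the next page.
"""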
import requests as rq
from collections import deque

from bs4 import BeautifulSoup

API = "https://ru.wikipedia.org"
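

# get_link: download one Wikipedia page and return every internal article link
# (hrefs starting with "/wiki/") found inside the main "content" div,
# expanded to absolute URLs.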
def get_link(url):
    resp = rq.get(url)
    if resp.status_code != 200:
        print("Can't fetch data!")
        exit(1)
    soup = BeautifulSoup(resp.content, "html5lib")
    urls = []
    content = soup.find("div", {"id": "content"})
    for links in content.find_all("a"):
        link = str(links.get("href"))
        if link.startswith("/wiki/"):
            urls.append(API + link)
    return urls
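

# bfs: breadth-first search from the start URL to the end URL. The queue holds
# (url, path) pairs, so the first time the destination is dequeued the path
# with the fewest hops is returned; None means no path was found.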
def bfs(start, end):
    visited = set()
    queue = deque([(start, [start])])
    while queue:
        url, path = queue.popleft()
        if url == end:
            return path
        if url in visited:
            continue
        visited.add(url)
        for link in get_link(url):
            if link not in visited:
                queue.append((link, path + [link]))
    return None
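

# find_paragraph: fetch the page at `src` and return the text of every <p>
# paragraph that contains an anchor pointing to `url`, with the target URL
# appended to each paragraph for context.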
def find_paragraph(src, url):
    resp = rq.get(src)
    if resp.status_code != 200:
        print("Can't fetch data!")
        exit(1)
    soup = BeautifulSoup(resp.content, "html.parser")
    texts = []
    # collect every paragraph whose links include the target url
    paragraphs = soup.find_all("p")
    for x in paragraphs:
        for p in x.find_all("a"):
            our_url = p.get("href")
            if our_url and API + our_url == url:
                texts.append(x.get_text() + f" {url}")
    return texts
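

# main: read the source and destination article URLs, run the BFS, and print
# the linking paragraph(s) for every consecutive pair of pages on the path.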
def main():
    URL1 = input(
        "Type src url (only 'https://ru.wikipedia.org' article URLs are supported): \n"
    )
    URL2 = input("Type destination url: \n")
    all_urls = bfs(URL1, URL2)
    if all_urls is not None:
        if len(all_urls) == 1:
            print("Something went wrong, maybe your src and destination are the same link")
        # bfs returns the path as a list of urls, for example [url1, url2, url3]:
        # url2 is linked from somewhere inside url1, url3 from url2, and so on,
        # i.e. site n is referenced by site n - 1.
        if len(all_urls) == 2:
            src, dest = all_urls[0], all_urls[-1]
            text = find_paragraph(src, dest)
            if text:
                print(text[0])
        if len(all_urls) > 2:
            # walk every consecutive (src, dest) pair on the path
            for x in range(1, len(all_urls)):
                src, dest = all_urls[x - 1], all_urls[x]
                text = find_paragraph(src, dest)
                for t in text:
                    print(t)
    else:
        print("No path found between the given pages")


if __name__ == "__main__":
    main()
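

# Example session (placeholder article names; assumes both pages exist and a
# link path between them can be found):
#   Type src url (...): https://ru.wikipedia.org/wiki/<source_article>
#   Type destination url: https://ru.wikipedia.org/wiki/<destination_article>
# The script then prints, for every hop, the paragraph of the current article
# that links to the next one.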