-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclinic.py
73 lines (61 loc) · 2.92 KB
/
clinic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def scrape_navbar_categories(url, *, timeout=10, output_dir="./result/"):
    """Crawl a site's navigation bar and save the text of every leaf page.

    The page at ``url`` is fetched and its ``ul.nav_menu.flex`` element is
    walked recursively. Each ``<li>`` becomes one category entry; entries
    that have no nested ``<ul>`` are treated as end branches, whose target
    page is fetched and whose ``div.main.content.wrapper`` text is printed
    and written to ``<output_dir>/<category name>.txt``.

    Args:
        url: Address of the page that holds the navigation bar.
        timeout: Seconds passed as ``timeout`` to every ``requests.get``
            call, so a stalled server cannot hang the crawl.
        output_dir: Directory that receives one ``.txt`` file per end
            branch. It is created on first use if it does not exist.

    Returns:
        A list of ``{"name", "link", "subcategories"}`` dicts mirroring the
        nesting of the menu. ``"link"`` is the ``href`` exactly as written
        in the markup (an empty string when the anchor has none). An empty
        list is returned when the start page cannot be fetched or contains
        no navigation menu.
    """

    def save_content(category_name, content):
        """Write ``content`` to a text file named after the category."""
        if output_dir:
            # The original assumed the directory already existed and
            # crashed with FileNotFoundError on a fresh checkout.
            os.makedirs(output_dir, exist_ok=True)
        safe_name = category_name.replace(" ", "_").replace("/", "_")
        filename = os.path.join(output_dir, safe_name + ".txt")
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(content)
        print(f"Content saved to {filename}")

    def extract_main_content(link, category_name):
        """Fetch ``link`` and print/save the text of its main content div."""
        try:
            page = requests.get(link, timeout=timeout)
            page.raise_for_status()
        except requests.exceptions.RequestException as e:
            # Best effort: one unreachable page must not abort the crawl.
            print(f"An error occurred while fetching the URL {link}: {e}")
            return

        page_soup = BeautifulSoup(page.text, 'html.parser')
        main_content = page_soup.select_one('div.main.content.wrapper')
        if main_content is None:
            print(f"No main content found for {link}")
            return

        # NOTE(review): get_text(strip=True) joins text nodes without a
        # separator, so adjacent elements run together in the saved file.
        content_text = main_content.get_text(strip=True)
        print(f"Main content from {link}:\n{content_text}")
        save_content(category_name, content_text)

    def extract_categories(menu, base_url):
        """Recursively turn the direct ``<li>`` children of ``menu`` into dicts."""
        categories = []
        for item in menu.find_all('li', recursive=False):
            link_tag = item.select_one('a')
            if link_tag is None:
                # No anchor anywhere inside this item: nothing to name or follow.
                continue

            category_name = link_tag.get_text(strip=True)
            # .get() tolerates anchors that carry no href attribute.
            category_link = link_tag.get('href', '')
            sub_menu = item.select_one('ul')
            if sub_menu is not None:
                sub_categories = extract_categories(sub_menu, base_url)
            else:
                sub_categories = []
            categories.append({"name": category_name, "link": category_link, "subcategories": sub_categories})

            # If it's an end branch, print the link and extract main content
            if sub_menu is None and category_link:
                print(f"End branch link: {category_link}")
                # Menu hrefs are often relative; resolve them against the
                # page they came from before fetching.
                extract_main_content(urljoin(base_url, category_link), category_name)
        return categories

    # Only the network call can raise RequestException, so keep the try narrow.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad HTTP status codes
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while fetching the URL: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    nav_menu = soup.select_one('ul.nav_menu.flex')
    if nav_menu is None:
        print("Navigation menu not found.")
        return []

    # response.url is the final address after any redirects, which is the
    # correct base for resolving relative links found on that page.
    return extract_categories(nav_menu, response.url)
import json

# Start page whose navigation bar is crawled.
url = "https://www.clarisclinic.com/"

# Walk the menu; scrape_navbar_categories also fetches each end-branch page.
categories = scrape_navbar_categories(url)

# Show the collected category tree as indented JSON.
print(json.dumps(categories, indent=2))