-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathweb_to_md.py
105 lines (86 loc) · 3.08 KB
/
web_to_md.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import os
import sys
import requests
from bs4 import BeautifulSoup
import html2text
from openai import OpenAI
from dotenv import load_dotenv
from datetime import datetime
from urllib.parse import urlparse
# Load environment variables via python-dotenv at import time, before any
# function below runs.
# NOTE(review): presumably this is how the OpenAI API key reaches OpenAI() —
# confirm against the project's .env conventions.
load_dotenv()
def get_webpage(url, timeout=30):
    """Fetch a web page and return its body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely on an unresponsive
            server, which would hang the whole script.

    Returns:
        The decoded response body (``response.text``).

    Exits:
        Prints the error and terminates the process with status 1 when the
        request fails, times out, or the server answers with an HTTP error
        status.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into exceptions so they are reported below
        # instead of being converted to markdown as if they were the page.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching webpage: {e}")
        sys.exit(1)
    return response.text
def clean_html(html_content):
    """Return the HTML with every <script> and <style> element removed.

    The input is parsed with BeautifulSoup's built-in 'html.parser'; the
    result is the re-serialized document (still HTML, not plain text).
    """
    document = BeautifulSoup(html_content, 'html.parser')
    # Scripts and stylesheets carry no readable content for the conversion
    # step, so drop them from the tree entirely.
    for unwanted in document.find_all(["script", "style"]):
        unwanted.decompose()
    return str(document)
def html_to_markdown(html_content):
    """Render an HTML string as markdown using html2text.

    Links, images and tables are all explicitly kept in the output.
    """
    renderer = html2text.HTML2Text()
    # Switch off every "ignore" flag so nothing is dropped from the page.
    for flag in ("ignore_links", "ignore_images", "ignore_tables"):
        setattr(renderer, flag, False)
    markdown = renderer.handle(html_content)
    return markdown
def enhance_markdown_with_gpt(markdown_content, url, *, model="gpt-4", max_tokens=None):
    """Ask an OpenAI chat model to clean up converted markdown.

    This step is best-effort: on any failure (no API key, network error,
    request rejected, empty answer) the error is printed and the input
    markdown is returned unchanged so the rest of the pipeline still runs.

    Args:
        markdown_content: Markdown produced from the page's HTML.
        url: Source URL, included in the prompt for context.
        model: Chat model name sent to the API.
        max_tokens: Optional cap on the completion length. Left unset by
            default: the previously hard-coded 8192 equals the entire
            documented context window of the base gpt-4 model, so prompt
            plus completion could never fit and the request was rejected.

    Returns:
        The model's rewritten markdown, or ``markdown_content`` as-is when
        the call fails or yields no text.
    """
    prompt = (
        f"Please improve this markdown content from {url}.\n"
        "Make it more readable and well-formatted while preserving all important information.\n"
        "Remove any unnecessary content like navigation menus, footers, or ads.\n"
        "Ensure proper heading hierarchy and clean formatting.\n"
        "Content:\n"
        f"{markdown_content}\n"
    )
    request = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are a markdown formatting expert. Convert web content to clean, well-structured markdown."},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.3,
    }
    if max_tokens is not None:
        request["max_tokens"] = max_tokens

    try:
        # Building the client raises when no API key is configured, so it
        # must sit inside the same best-effort guard as the request itself.
        client = OpenAI()
        response = client.chat.completions.create(**request)
        enhanced = response.choices[0].message.content
    except Exception as e:  # deliberate catch-all: this step must never abort the run
        print(f"Error with OpenAI API: {e}")
        return markdown_content
    # The API may return no text content; never hand None to the file writer.
    return enhanced or markdown_content
def save_markdown(markdown_content, url):
    """Write markdown to ``output/<host>_<timestamp>.md`` and return the path.

    The ``output`` directory is created (relative to the current working
    directory) when it does not exist yet.

    Args:
        markdown_content: Text to write, encoded as UTF-8.
        url: Source URL; its network location is used to name the file.

    Returns:
        The relative path of the file that was written.
    """
    # Create output directory if it doesn't exist
    os.makedirs('output', exist_ok=True)

    netloc = urlparse(url).netloc
    # Drop any "user:password@" prefix so credentials never end up in a
    # file name.
    host = netloc.rpartition('@')[2]
    # A port separator (':') or IPv6 brackets are not valid file-name
    # characters on every platform; keep only a conservative safe set.
    safe_host = ''.join(
        ch if ch.isalnum() or ch in '._-' else '_' for ch in host
    )
    if not safe_host:
        # URL without a network location (e.g. missing scheme).
        safe_host = 'page'

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f"output/{safe_host}_{timestamp}.md"
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(markdown_content)
    return filename
def main():
    """Command-line entry point: convert one URL to a saved markdown file.

    Expects exactly one argument (the URL); otherwise prints a usage line
    and exits with status 1. The page is fetched, stripped of scripts and
    styles, converted to markdown, passed through the GPT clean-up step and
    finally written to disk, with the resulting path printed.
    """
    argv = sys.argv
    if len(argv) != 2:
        print("Usage: python web_to_md.py <url>")
        sys.exit(1)

    target = argv[1]
    print(f"Processing {target}...")

    # Fetch -> clean -> convert -> enhance, each stage feeding the next.
    raw_html = get_webpage(target)
    markdown = html_to_markdown(clean_html(raw_html))
    polished = enhance_markdown_with_gpt(markdown, target)

    # Persist the final text and report where it went.
    saved_path = save_markdown(polished, target)
    print(f"Markdown saved to: {saved_path}")
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()