
Commit 9bdbd56

Merge pull request #124 from jon-fox/feature/webscrapertool-error-status
feat(WebpageScraperTool): Add error field to output schema, update docs, and add tests #105
2 parents 14dfb43 + 9e80970 commit 9bdbd56

6 files changed (+282 −61 lines)

README.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -375,7 +375,7 @@ For more information on using and creating tools, see the [Atomic Forge README](
 
 ## Contributing
 
-We welcome contributions! Please see the [Developer Guide](/guides/DEV_GUIDE.md) for detailed information on how to contribute to Atomic Agents. Here are some quick steps:
+We welcome contributions! Please see the [Contributing Guide](/docs/contributing.md) for detailed information on how to contribute to Atomic Agents. Here are some quick steps:
 
 1. Fork the repository
 2. Create a new branch (`git checkout -b feature-branch`)
```
atomic-examples/deep-research/deep_research/tools/webpage_scraper.py

Lines changed: 43 additions & 28 deletions

```diff
@@ -48,6 +48,7 @@ class WebpageScraperToolOutputSchema(BaseIOSchema):
 
     content: str = Field(..., description="The scraped content in markdown format.")
     metadata: WebpageMetadata = Field(..., description="Metadata about the scraped webpage.")
+    error: Optional[str] = Field(None, description="Error message if the scraping failed.")
 
 
 #################
@@ -214,38 +215,47 @@ def run(self, params: WebpageScraperToolInputSchema) -> WebpageScraperToolOutput
         Returns:
             WebpageScraperToolOutputSchema: The output containing the markdown content and metadata.
         """
-        # Fetch webpage content
-        html_content = self._fetch_webpage(str(params.url))
+        try:
+            # Fetch webpage content
+            html_content = self._fetch_webpage(str(params.url))
 
-        # Parse HTML with BeautifulSoup
-        soup = BeautifulSoup(html_content, "html.parser")
+            # Parse HTML with BeautifulSoup
+            soup = BeautifulSoup(html_content, "html.parser")
 
-        # Extract main content using custom extraction
-        main_content = self._extract_main_content(soup)
+            # Extract main content using custom extraction
+            main_content = self._extract_main_content(soup)
 
-        # Convert to markdown
-        markdown_options = {
-            "strip": ["script", "style"],
-            "heading_style": "ATX",
-            "bullets": "-",
-            "wrap": True,
-        }
+            # Convert to markdown
+            markdown_options = {
+                "strip": ["script", "style"],
+                "heading_style": "ATX",
+                "bullets": "-",
+                "wrap": True,
+            }
 
-        if not params.include_links:
-            markdown_options["strip"].append("a")
+            if not params.include_links:
+                markdown_options["strip"].append("a")
 
-        markdown_content = markdownify(main_content, **markdown_options)
+            markdown_content = markdownify(main_content, **markdown_options)
 
-        # Clean up the markdown
-        markdown_content = self._clean_markdown(markdown_content)
+            # Clean up the markdown
+            markdown_content = self._clean_markdown(markdown_content)
 
-        # Extract metadata
-        metadata = self._extract_metadata(soup, Document(html_content), str(params.url))
+            # Extract metadata
+            metadata = self._extract_metadata(soup, Document(html_content), str(params.url))
 
-        return WebpageScraperToolOutputSchema(
-            content=markdown_content,
-            metadata=metadata,
-        )
+            return WebpageScraperToolOutputSchema(
+                content=markdown_content,
+                metadata=metadata,
+            )
+        except Exception as e:
+            # Create empty/minimal metadata with at least the domain
+            domain = urlparse(str(params.url)).netloc
+            minimal_metadata = WebpageMetadata(title="Error retrieving page", domain=domain)
+
+            # Return with error message in the error field
+            return WebpageScraperToolOutputSchema(content="", metadata=minimal_metadata, error=str(e))
 
 
 #################
@@ -266,11 +276,16 @@ def run(self, params: WebpageScraperToolInputSchema) -> WebpageScraperToolOutput
             )
         )
 
-        console.print(Panel.fit("Metadata", style="bold green"))
-        console.print(result.metadata.model_dump_json(indent=2))
+        # Check if there was an error during scraping, otherwise print the results
+        if result.error:
+            console.print(Panel.fit("Error", style="bold red"))
+            console.print(f"[red]{result.error}[/red]")
+        else:
+            console.print(Panel.fit("Metadata", style="bold green"))
+            console.print(result.metadata.model_dump_json(indent=2))
 
-        console.print(Panel.fit("Content Preview (first 500 chars)", style="bold green"))
-        console.print(result.content)
+            console.print(Panel.fit("Content Preview (first 500 chars)", style="bold green"))
+            console.print(result.content[:500] + ("..." if len(result.content) > 500 else ""))
 
     except Exception as e:
         console.print(f"[red]Error:[/red] {str(e)}")
```
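With this change, `run()` always returns a `WebpageScraperToolOutputSchema`; failures surface in the `error` field instead of propagating as exceptions. A minimal sketch of the resulting calling pattern (the `tool.webpage_scraper` import path is assumed here and will differ depending on where the tool lives in your project):

```python
from tool.webpage_scraper import (  # assumed path; adjust to your layout
    WebpageScraperTool,
    WebpageScraperToolConfig,
    WebpageScraperToolInputSchema,
)

scraper = WebpageScraperTool(WebpageScraperToolConfig())
result = scraper.run(WebpageScraperToolInputSchema(url="https://example.com"))

if result.error:
    # e.g. a network failure, a non-2xx status, or an oversized page
    print(f"Scrape of {result.metadata.domain} failed: {result.error}")
else:
    print(result.metadata.title)
    print(result.content[:500])
```

Because the `except` branch still returns minimal metadata (`title="Error retrieving page"` plus the domain), downstream code can log a useful identifier even for failed scrapes.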

atomic-examples/web-search-agent/README.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -81,7 +81,7 @@ Set the `SEARXNG_BASE_URL` environment variable to `http://localhost:8080/` in y
 
 
 Note: for the agent to communicate with SearxNG, the instance must enable the JSON engine, which is disabled by default.
-Edit `searxng/settings.yml` and add `- json` in the `search.formats` section, then restart the container.
+Edit `/etc/searxng/settings.yml` and add `- json` in the `search.formats` section, then restart the container.
 
 
 ## Customization
```
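For reference, a sketch of what the edited section of `settings.yml` should look like after the change; the surrounding keys vary between SearxNG versions, but `formats` sits under the top-level `search` key:

```yaml
search:
  formats:
    - html
    - json
```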
Lines changed: 188 additions & 0 deletions (new test file)

```python
import os
import sys
import pytest
from unittest.mock import patch, MagicMock

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from tool.webpage_scraper import (  # noqa: E402
    WebpageScraperTool,
    WebpageScraperToolInputSchema,
    WebpageScraperToolOutputSchema,
    WebpageScraperToolConfig,
)


@pytest.fixture
def mock_requests_get():
    with patch("tool.webpage_scraper.requests.get") as mock_get:
        # Create mock response
        mock_response = MagicMock()
        mock_response.text = """
        <html>
            <head>
                <title>Test Page</title>
                <meta name="author" content="Test Author">
                <meta name="description" content="Test Description">
                <meta property="og:site_name" content="Test Site">
            </head>
            <body>
                <main>
                    <h1>Test Heading</h1>
                    <p>Test paragraph with <a href="https://example.com">link</a>.</p>
                </main>
            </body>
        </html>
        """
        mock_response.content = mock_response.text.encode("utf-8")
        mock_response.status_code = 200
        mock_response.raise_for_status = MagicMock()

        # Configure the mock
        mock_get.return_value = mock_response
        yield mock_get


def test_webpage_scraper_tool_basic(mock_requests_get):
    # Initialize the tool
    scraper_tool = WebpageScraperTool(WebpageScraperToolConfig())
    input_schema = WebpageScraperToolInputSchema(url="https://example.com")

    # Run the tool
    result = scraper_tool.run(input_schema)

    # Assertions
    assert isinstance(result, WebpageScraperToolOutputSchema)
    assert "Test Heading" in result.content
    assert "Test paragraph" in result.content
    assert "link" in result.content
    assert result.metadata.title == "Test Page"
    assert result.metadata.author == "Test Author"
    assert result.metadata.description == "Test Description"
    assert result.metadata.site_name == "Test Site"
    assert result.metadata.domain == "example.com"
    assert result.error is None


def test_webpage_scraper_tool_without_links(mock_requests_get):
    # Initialize the tool
    scraper_tool = WebpageScraperTool(WebpageScraperToolConfig())
    input_schema = WebpageScraperToolInputSchema(url="https://example.com", include_links=False)

    # Run the tool
    result = scraper_tool.run(input_schema)

    # Assertions
    assert isinstance(result, WebpageScraperToolOutputSchema)
    assert "Test paragraph with link" in result.content
    assert "https://example.com" not in result.content  # Link URL should not be included


def test_webpage_scraper_tool_http_error(mock_requests_get):
    # Configure mock to raise an exception
    mock_requests_get.return_value.raise_for_status.side_effect = Exception("404 Client Error")

    # Initialize the tool
    scraper_tool = WebpageScraperTool(WebpageScraperToolConfig())
    input_schema = WebpageScraperToolInputSchema(url="https://example.com/not-found")

    # Run the tool
    result = scraper_tool.run(input_schema)

    # Assertions
    assert isinstance(result, WebpageScraperToolOutputSchema)
    assert result.content == ""  # Content should be empty
    assert result.metadata.title == "Error retrieving page"
    assert result.metadata.domain == "example.com"
    assert "404 Client Error" in result.error


def test_webpage_scraper_tool_content_too_large(mock_requests_get):
    # Configure mock content to exceed max length
    max_length = 1_000_000
    mock_requests_get.return_value.content = b"a" * (max_length + 1)

    # Initialize the tool
    scraper_tool = WebpageScraperTool(WebpageScraperToolConfig(max_content_length=max_length))
    input_schema = WebpageScraperToolInputSchema(url="https://example.com/large-page")

    # Run the tool
    result = scraper_tool.run(input_schema)

    # Assertions
    assert isinstance(result, WebpageScraperToolOutputSchema)
    assert "exceeds maximum" in result.error


def test_webpage_scraper_tool_extract_metadata():
    # Initialize the tool
    scraper_tool = WebpageScraperTool(WebpageScraperToolConfig())

    # Create a minimal soup object with metadata
    soup = MagicMock()

    # Create individual mock tags with get methods
    author_tag = MagicMock()
    author_tag.get.return_value = "Author Name"

    description_tag = MagicMock()
    description_tag.get.return_value = "Page Description"

    site_name_tag = MagicMock()
    site_name_tag.get.return_value = "Site Name"

    # Configure find method to return the right mock based on arguments
    def mock_find(tag, attrs=None):
        if tag == "meta" and attrs == {"name": "author"}:
            return author_tag
        elif tag == "meta" and attrs == {"name": "description"}:
            return description_tag
        elif tag == "meta" and attrs == {"property": "og:site_name"}:
            return site_name_tag
        return None

    soup.find.side_effect = mock_find

    doc = MagicMock()
    doc.title.return_value = "Page Title"

    # Call the method directly
    metadata = scraper_tool._extract_metadata(soup, doc, "https://example.org/page")

    # Assertions
    assert metadata.title == "Page Title"
    assert metadata.author == "Author Name"
    assert metadata.description == "Page Description"
    assert metadata.site_name == "Site Name"
    assert metadata.domain == "example.org"


def test_webpage_scraper_tool_clean_markdown():
    # Initialize the tool
    scraper_tool = WebpageScraperTool(WebpageScraperToolConfig())

    # Input markdown with excess whitespace
    dirty_markdown = """
    # Title



    Paragraph with trailing spaces    

    * List item 1
    * List item 2


    """

    # Clean the markdown
    cleaned = scraper_tool._clean_markdown(dirty_markdown)

    # Assertions
    assert cleaned.count("\n\n\n") == 0  # No triple newlines
    assert "spaces \n" not in cleaned  # No trailing spaces
    assert cleaned.endswith("\n")  # Ends with newline


if __name__ == "__main__":
    pytest.main([__file__])
```
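The suite patches `requests.get` at the module under test (`tool.webpage_scraper`), so no network access is needed, and the error-path tests assert on the new `error` field rather than expecting an exception. It can be run with `pytest` or, via the `pytest.main` hook, as a plain script. Only failures raised from `raise_for_status()` and the size check are covered; a hypothetical extra case (not part of this commit) for connection-level failures could reuse the same fixture:

```python
def test_webpage_scraper_tool_connection_error(mock_requests_get):
    # Hypothetical test, not in the commit: requests.get itself raising
    # should also be caught by run() and surfaced in the error field.
    mock_requests_get.side_effect = Exception("Connection refused")

    scraper_tool = WebpageScraperTool(WebpageScraperToolConfig())
    result = scraper_tool.run(WebpageScraperToolInputSchema(url="https://example.com"))

    assert result.content == ""
    assert "Connection refused" in result.error
```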
