
Commit 9bdbd56

Merge pull request #124 from jon-fox/feature/webscrapertool-error-status
feat(WebpageScraperTool): Add error field to output schema, update docs, and add tests #105
2 parents 14dfb43 + 9e80970 commit 9bdbd56

6 files changed (+282 −61 lines)

README.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -375,7 +375,7 @@ For more information on using and creating tools, see the [Atomic Forge README](
 
 ## Contributing
 
-We welcome contributions! Please see the [Developer Guide](/guides/DEV_GUIDE.md) for detailed information on how to contribute to Atomic Agents. Here are some quick steps:
+We welcome contributions! Please see the [Contributing Guide](/docs/contributing.md) for detailed information on how to contribute to Atomic Agents. Here are some quick steps:
 
 1. Fork the repository
 2. Create a new branch (`git checkout -b feature-branch`)
```
atomic-examples/deep-research/deep_research/tools/webpage_scraper.py

Lines changed: 43 additions & 28 deletions

```diff
@@ -48,6 +48,7 @@ class WebpageScraperToolOutputSchema(BaseIOSchema):
 
     content: str = Field(..., description="The scraped content in markdown format.")
     metadata: WebpageMetadata = Field(..., description="Metadata about the scraped webpage.")
+    error: Optional[str] = Field(None, description="Error message if the scraping failed.")
 
 
 #################
@@ -214,38 +215,47 @@ def run(self, params: WebpageScraperToolInputSchema) -> WebpageScraperToolOutput
         Returns:
             WebpageScraperToolOutputSchema: The output containing the markdown content and metadata.
         """
-        # Fetch webpage content
-        html_content = self._fetch_webpage(str(params.url))
+        try:
+            # Fetch webpage content
+            html_content = self._fetch_webpage(str(params.url))
 
-        # Parse HTML with BeautifulSoup
-        soup = BeautifulSoup(html_content, "html.parser")
+            # Parse HTML with BeautifulSoup
+            soup = BeautifulSoup(html_content, "html.parser")
 
-        # Extract main content using custom extraction
-        main_content = self._extract_main_content(soup)
+            # Extract main content using custom extraction
+            main_content = self._extract_main_content(soup)
 
-        # Convert to markdown
-        markdown_options = {
-            "strip": ["script", "style"],
-            "heading_style": "ATX",
-            "bullets": "-",
-            "wrap": True,
-        }
+            # Convert to markdown
+            markdown_options = {
+                "strip": ["script", "style"],
+                "heading_style": "ATX",
+                "bullets": "-",
+                "wrap": True,
+            }
 
-        if not params.include_links:
-            markdown_options["strip"].append("a")
+            if not params.include_links:
+                markdown_options["strip"].append("a")
 
-        markdown_content = markdownify(main_content, **markdown_options)
+            markdown_content = markdownify(main_content, **markdown_options)
 
-        # Clean up the markdown
-        markdown_content = self._clean_markdown(markdown_content)
+            # Clean up the markdown
+            markdown_content = self._clean_markdown(markdown_content)
 
-        # Extract metadata
-        metadata = self._extract_metadata(soup, Document(html_content), str(params.url))
+            # Extract metadata
+            metadata = self._extract_metadata(soup, Document(html_content), str(params.url))
 
-        return WebpageScraperToolOutputSchema(
-            content=markdown_content,
-            metadata=metadata,
-        )
+            return WebpageScraperToolOutputSchema(
+                content=markdown_content,
+                metadata=metadata,
+            )
+        except Exception as e:
+            # Create empty/minimal metadata with at least the domain
+            domain = urlparse(str(params.url)).netloc
+            minimal_metadata = WebpageMetadata(title="Error retrieving page", domain=domain)
+
+            # Return with error message in the error field
+            return WebpageScraperToolOutputSchema(content="", metadata=minimal_metadata, error=str(e))
 
 
 #################
@@ -266,11 +276,16 @@ def run(self, params: WebpageScraperToolInputSchema) -> WebpageScraperToolOutput
             )
         )
 
-        console.print(Panel.fit("Metadata", style="bold green"))
-        console.print(result.metadata.model_dump_json(indent=2))
+        # Check if there was an error during scraping, otherwise print the results
+        if result.error:
+            console.print(Panel.fit("Error", style="bold red"))
+            console.print(f"[red]{result.error}[/red]")
+        else:
+            console.print(Panel.fit("Metadata", style="bold green"))
+            console.print(result.metadata.model_dump_json(indent=2))
 
-        console.print(Panel.fit("Content Preview (first 500 chars)", style="bold green"))
-        console.print(result.content)
+            console.print(Panel.fit("Content Preview (first 500 chars)", style="bold green"))
+            console.print(result.content[:500] + ("..." if len(result.content) > 500 else ""))
 
     except Exception as e:
         console.print(f"[red]Error:[/red] {str(e)}")
```
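With this change, `run()` always returns a `WebpageScraperToolOutputSchema`; failures surface in the `error` field instead of propagating as exceptions. A minimal sketch of the resulting calling pattern (the `tool.webpage_scraper` import path is assumed here and will differ depending on where the tool lives in your project):

```python
from tool.webpage_scraper import (  # assumed path; adjust to your layout
    WebpageScraperTool,
    WebpageScraperToolConfig,
    WebpageScraperToolInputSchema,
)

scraper = WebpageScraperTool(WebpageScraperToolConfig())
result = scraper.run(WebpageScraperToolInputSchema(url="https://example.com"))

if result.error:
    # e.g. a network failure, a non-2xx status, or an oversized page
    print(f"Scrape of {result.metadata.domain} failed: {result.error}")
else:
    print(result.metadata.title)
    print(result.content[:500])
```

Because the `except` branch still returns minimal metadata (`title="Error retrieving page"` plus the domain), downstream code can log a useful identifier even for failed scrapes.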

atomic-examples/web-search-agent/README.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -81,7 +81,7 @@ Set the `SEARXNG_BASE_URL` environment variable to `http://localhost:8080/` in y
 
 
 Note: for the agent to communicate with SearxNG, the instance must enable the JSON engine, which is disabled by default.
-Edit `searxng/settings.yml` and add `- json` in the `search.formats` section, then restart the container.
+Edit `/etc/searxng/settings.yml` and add `- json` in the `search.formats` section, then restart the container.
 
 
 ## Customization
```
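For reference, a sketch of what the edited section of `settings.yml` should look like after the change; the surrounding keys vary between SearxNG versions, but `formats` sits under the top-level `search` key:

```yaml
search:
  formats:
    - html
    - json
```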
Lines changed: 188 additions & 0 deletions (new test file)

```python
import os
import sys
import pytest
from unittest.mock import patch, MagicMock

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from tool.webpage_scraper import (  # noqa: E402
    WebpageScraperTool,
    WebpageScraperToolInputSchema,
    WebpageScraperToolOutputSchema,
    WebpageScraperToolConfig,
)


@pytest.fixture
def mock_requests_get():
    with patch("tool.webpage_scraper.requests.get") as mock_get:
        # Create mock response
        mock_response = MagicMock()
        mock_response.text = """
        <html>
            <head>
                <title>Test Page</title>
                <meta name="author" content="Test Author">
                <meta name="description" content="Test Description">
                <meta property="og:site_name" content="Test Site">
            </head>
            <body>
                <main>
                    <h1>Test Heading</h1>
                    <p>Test paragraph with <a href="https://example.com">link</a>.</p>
                </main>
            </body>
        </html>
        """
        mock_response.content = mock_response.text.encode("utf-8")
        mock_response.status_code = 200
        mock_response.raise_for_status = MagicMock()

        # Configure the mock
        mock_get.return_value = mock_response
        yield mock_get


def test_webpage_scraper_tool_basic(mock_requests_get):
    # Initialize the tool
    scraper_tool = WebpageScraperTool(WebpageScraperToolConfig())
    input_schema = WebpageScraperToolInputSchema(url="https://example.com")

    # Run the tool
    result = scraper_tool.run(input_schema)

    # Assertions
    assert isinstance(result, WebpageScraperToolOutputSchema)
    assert "Test Heading" in result.content
    assert "Test paragraph" in result.content
    assert "link" in result.content
    assert result.metadata.title == "Test Page"
    assert result.metadata.author == "Test Author"
    assert result.metadata.description == "Test Description"
    assert result.metadata.site_name == "Test Site"
    assert result.metadata.domain == "example.com"
    assert result.error is None


def test_webpage_scraper_tool_without_links(mock_requests_get):
    # Initialize the tool
    scraper_tool = WebpageScraperTool(WebpageScraperToolConfig())
    input_schema = WebpageScraperToolInputSchema(url="https://example.com", include_links=False)

    # Run the tool
    result = scraper_tool.run(input_schema)

    # Assertions
    assert isinstance(result, WebpageScraperToolOutputSchema)
    assert "Test paragraph with link" in result.content
    assert "https://example.com" not in result.content  # Link URL should not be included


def test_webpage_scraper_tool_http_error(mock_requests_get):
    # Configure mock to raise an exception
    mock_requests_get.return_value.raise_for_status.side_effect = Exception("404 Client Error")

    # Initialize the tool
    scraper_tool = WebpageScraperTool(WebpageScraperToolConfig())
    input_schema = WebpageScraperToolInputSchema(url="https://example.com/not-found")

    # Run the tool
    result = scraper_tool.run(input_schema)

    # Assertions
    assert isinstance(result, WebpageScraperToolOutputSchema)
    assert result.content == ""  # Content should be empty
    assert result.metadata.title == "Error retrieving page"
    assert result.metadata.domain == "example.com"
    assert "404 Client Error" in result.error


def test_webpage_scraper_tool_content_too_large(mock_requests_get):
    # Configure mock content to exceed max length
    max_length = 1_000_000
    mock_requests_get.return_value.content = b"a" * (max_length + 1)

    # Initialize the tool
    scraper_tool = WebpageScraperTool(WebpageScraperToolConfig(max_content_length=max_length))
    input_schema = WebpageScraperToolInputSchema(url="https://example.com/large-page")

    # Run the tool
    result = scraper_tool.run(input_schema)

    # Assertions
    assert isinstance(result, WebpageScraperToolOutputSchema)
    assert "exceeds maximum" in result.error


def test_webpage_scraper_tool_extract_metadata():
    # Initialize the tool
    scraper_tool = WebpageScraperTool(WebpageScraperToolConfig())

    # Create a minimal soup object with metadata
    soup = MagicMock()

    # Create individual mock tags with get methods
    author_tag = MagicMock()
    author_tag.get.return_value = "Author Name"

    description_tag = MagicMock()
    description_tag.get.return_value = "Page Description"

    site_name_tag = MagicMock()
    site_name_tag.get.return_value = "Site Name"

    # Configure find method to return the right mock based on arguments
    def mock_find(tag, attrs=None):
        if tag == "meta" and attrs == {"name": "author"}:
            return author_tag
        elif tag == "meta" and attrs == {"name": "description"}:
            return description_tag
        elif tag == "meta" and attrs == {"property": "og:site_name"}:
            return site_name_tag
        return None

    soup.find.side_effect = mock_find

    doc = MagicMock()
    doc.title.return_value = "Page Title"

    # Call the method directly
    metadata = scraper_tool._extract_metadata(soup, doc, "https://example.org/page")

    # Assertions
    assert metadata.title == "Page Title"
    assert metadata.author == "Author Name"
    assert metadata.description == "Page Description"
    assert metadata.site_name == "Site Name"
    assert metadata.domain == "example.org"


def test_webpage_scraper_tool_clean_markdown():
    # Initialize the tool
    scraper_tool = WebpageScraperTool(WebpageScraperToolConfig())

    # Input markdown with excess whitespace
    dirty_markdown = """
    # Title



    Paragraph with trailing spaces    

    * List item 1
    * List item 2


    """

    # Clean the markdown
    cleaned = scraper_tool._clean_markdown(dirty_markdown)

    # Assertions
    assert cleaned.count("\n\n\n") == 0  # No triple newlines
    assert "spaces \n" not in cleaned  # No trailing spaces
    assert cleaned.endswith("\n")  # Ends with newline


if __name__ == "__main__":
    pytest.main([__file__])
```
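The suite patches `requests.get` at the module under test (`tool.webpage_scraper`), so no network access is needed, and the error-path tests assert on the new `error` field rather than expecting an exception. It can be run with `pytest` or, via the `pytest.main` hook, as a plain script. Only failures raised from `raise_for_status()` and the size check are covered; a hypothetical extra case (not part of this commit) for connection-level failures could reuse the same fixture:

```python
def test_webpage_scraper_tool_connection_error(mock_requests_get):
    # Hypothetical test, not in the commit: requests.get itself raising
    # should also be caught by run() and surfaced in the error field.
    mock_requests_get.side_effect = Exception("Connection refused")

    scraper_tool = WebpageScraperTool(WebpageScraperToolConfig())
    result = scraper_tool.run(WebpageScraperToolInputSchema(url="https://example.com"))

    assert result.content == ""
    assert "Connection refused" in result.error
```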
