
Commit 3a1e872

fix: deep crawl duplicate url processing
Fix BFSDeepCrawlStrategy so that URLs which differ only in their domain spelling or port are processed once. The common case is www.example.com vs example.com, but this also covers https://example.com vs https://example.com:443. Fixes #843
1 parent: fb7fe6a
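
The failure mode is easy to see in isolation. The toy sketch below is illustrative only (it is not the repository's code): three spellings of the same page produce three distinct keys in a plain string-keyed visited set, so the page would be queued more than once.

```python
# Illustrative only: the same page spelled three ways yields three distinct
# strings, so a naive visited-set check lets every variant through.
variants = [
    "https://example.com/page",
    "https://www.example.com/page",
    "https://example.com:443/page",
]

visited: set[str] = set()
queued: list[str] = []
for url in variants:
    if url in visited:      # never true here: the raw strings all differ
        continue
    visited.add(url)
    queued.append(url)

print(len(queued))  # 3 -- the same page would be crawled three times
```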

File tree

3 files changed: +128, -53 lines

crawl4ai/deep_crawling/bfs_strategy.py
19 additions, 7 deletions

@@ -10,7 +10,7 @@
 from .scorers import URLScorer
 from . import DeepCrawlStrategy
 from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
-from ..utils import normalize_url_for_deep_crawl
+from ..utils import normalize_url_for_deep_crawl, base_domain_url
 from math import inf as infinity
 
 class BFSDeepCrawlStrategy(DeepCrawlStrategy):
@@ -96,19 +96,31 @@ async def link_discovery(
         if self.include_external:
             links += result.links.get("external", [])
 
-        valid_links = []
-
+        valid_links: List[Tuple[str, float]] = []
+
         # First collect all valid links
+        seen: Set[str] = set()
         for link in links:
             url: Optional[str] = link.get("href")
             if not url:
                 continue
 
             # Strip URL fragments to avoid duplicate crawling
-            # base_url = url.split('#')[0] if url else url
             base_url = normalize_url_for_deep_crawl(url, source_url)
-            if base_url in visited:
+            if base_url in visited or base_url in seen:
+                continue
+
+            # Normalize the URL to its base domain
+            domain_url: str = base_domain_url(base_url)
+            if domain_url in visited or domain_url in seen:
                 continue
+
+            # Register as seen so we don't process it again, this avoids duplicates
+            # for URLs which have the same base domain, which would otherwise be
+            # added to next_depth multiple times. This also eliminates duplicate
+            # work in this loop processing the same URL multiple times.
+            seen.add(domain_url)
+
             if not await self.can_process_url(url, next_depth):
                 self.stats.urls_skipped += 1
                 continue
@@ -161,7 +173,7 @@ async def _arun_batch(
 
         while current_level and not self._cancel_event.is_set():
             next_level: List[Tuple[str, Optional[str]]] = []
-            urls = [url for url, _ in current_level]
+            urls = [base_domain_url(url) for url, _ in current_level]
            visited.update(urls)
 
             # Clone the config to disable deep crawling recursion and enforce batch mode.
@@ -203,7 +215,7 @@ async def _arun_stream(
 
         while current_level and not self._cancel_event.is_set():
             next_level: List[Tuple[str, Optional[str]]] = []
-            urls = [url for url, _ in current_level]
+            urls = [base_domain_url(url) for url, _ in current_level]
             visited.update(urls)
 
             stream_config = config.clone(deep_crawl_strategy=None, stream=True)
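
Taken together, the new `seen` set and the base-domain key behave like the standalone sketch below. The `normalize` helper here is a simplified stand-in for the library's `normalize_url_for_deep_crawl`/`base_domain_url` pair, written only to show the dedup pattern; it is not the actual implementation.

```python
from urllib.parse import urlparse, urlunparse

def normalize(url: str) -> str:
    """Simplified stand-in: lowercase host, drop 'www.', default ports, and fragments."""
    p = urlparse(url)
    host = p.netloc.lower().split(":")[0].removeprefix("www.")
    default = {"http": 80, "https": 443}.get(p.scheme)
    netloc = host if p.port in (None, default) else f"{host}:{p.port}"
    return urlunparse((p.scheme, netloc, p.path, p.params, p.query, ""))

links = [
    "https://example.com/a",
    "https://www.example.com/a",
    "https://example.com:443/a",
    "https://example.com/b",
]

visited: set[str] = set()   # URLs already crawled at earlier depths
seen: set[str] = set()      # URLs collected for the next depth in this pass
next_level: list[str] = []
for url in links:
    key = normalize(url)
    if key in visited or key in seen:
        continue            # duplicate of something already crawled or already queued
    seen.add(key)           # register before queueing so later variants are skipped
    next_level.append(url)

print(next_level)  # ['https://example.com/a', 'https://example.com/b']
```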

crawl4ai/utils.py
88 additions, 42 deletions

@@ -1974,8 +1974,6 @@ def fast_format_html(html_string):
 
 def normalize_url(href, base_url):
     """Normalize URLs to ensure consistent format"""
-    from urllib.parse import urljoin, urlparse
-
     # Parse base URL to get components
     parsed_base = urlparse(base_url)
     if not parsed_base.scheme or not parsed_base.netloc:
@@ -1988,7 +1986,7 @@ def normalize_url(href, base_url):
 
 def normalize_url_for_deep_crawl(href: str, base_url: str) -> str:
     """Normalize URLs to ensure consistent format"""
-    from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode
+    from urllib.parse import parse_qs, urlencode
 
     # Use urljoin to handle relative URLs
     full_url = urljoin(base_url, href.strip())
@@ -2029,6 +2027,37 @@ def normalize_url_for_deep_crawl(href: str, base_url: str) -> str:
 
     return normalized
 
+def base_domain_url(url: str) -> str:
+    """Return the URL using the base domain.
+
+    This can be used to ensure that we don't revisit the same URL
+    multiple times even if the domain changes e.g. www.example.com vs example.com
+    or if the URL has a port number which is not needed e.g. example.com:80.
+
+    See `get_base_domain` for details on how the base domain is extracted.
+    Args:
+        url (str): The URL to extract the base domain from.
+    Returns:
+        str: The url using the base domain or url unchanged if parsing fails.
+    """
+
+    try:
+        parsed: ParseResult = urlparse(url)
+        base_domain: str = get_base_domain_parsed(parsed)
+        if not base_domain:
+            return url
+
+        return urlunparse((
+            parsed.scheme,
+            base_domain,
+            parsed.path,
+            parsed.params,
+            parsed.query,
+            ""
+        ))
+    except Exception:
+        return url
+
 @lru_cache(maxsize=10000)
 def efficient_normalize_url_for_deep_crawl(href, base_url):
     """Efficient URL normalization with proper parsing"""
@@ -2112,47 +2141,64 @@ def get_base_domain(url: str) -> str:
     """
     try:
         parsed: ParseResult = urlparse(url)
-        domain = parsed.netloc.lower()
-        if not domain:
-            return ""
-
-        # Remove port if present
-        domain = domain.split(":")[0]
-
-        # Remove www
-        domain = domain.removeprefix("www.")
-
-        port_suffix: str = ""
-        port = parsed.port
-        if port is not None and port != DEFAULT_PORTS.get(parsed.scheme):
-            # Port needed.
-            port_suffix = f":{port}"
-
-
-        # Extract last two parts of domain (handles co.uk etc)
-        parts = domain.split(".")
-        if len(parts) > 2 and parts[-2] in {
-            "co",
-            "com",
-            "org",
-            "gov",
-            "edu",
-            "net",
-            "mil",
-            "int",
-            "ac",
-            "ad",
-            "ae",
-            "af",
-            "ag",
-        }:
-            return ".".join(parts[-3:]) + port_suffix
-
-        return ".".join(parts[-2:]) + port_suffix
+        return get_base_domain_parsed(parsed)
     except Exception:
         return ""
 
 
+def get_base_domain_parsed(parsed: ParseResult) -> str:
+    """
+    Extract the base domain from the parsed URL, handling common edge cases.
+
+    How it works:
+    1. Parses the URL to extract the domain.
+    2. Removes the port number and 'www' prefix if necessary.
+    3. Handles special domains (e.g., 'co.uk') to extract the correct base.
+
+    Args:
+        parsed (str): The parsed URL to extract the base domain from.
+
+    Returns:
+        str: The extracted base domain or an empty string if netloc is empty.
+    """
+    domain: str = parsed.netloc.lower()
+    if not domain:
+        return ""
+
+    # Remove port if present
+    domain = domain.split(":")[0]
+
+    # Remove www
+    domain = domain.removeprefix("www.")
+
+    port_suffix: str = ""
+    port = parsed.port
+    if port is not None and port != DEFAULT_PORTS.get(parsed.scheme):
+        # Port needed.
+        port_suffix = f":{port}"
+
+    # Extract last two parts of domain (handles co.uk etc)
+    parts = domain.split(".")
+    if len(parts) > 2 and parts[-2] in {
+        "co",
+        "com",
+        "org",
+        "gov",
+        "edu",
+        "net",
+        "mil",
+        "int",
+        "ac",
+        "ad",
+        "ae",
+        "af",
+        "ag",
+    }:
+        return ".".join(parts[-3:]) + port_suffix
+
+    return ".".join(parts[-2:]) + port_suffix
+
+
 def is_external_url(url: str, base_domain: str) -> bool:
     """
     Extract the base domain from a given URL, handling common edge cases.
@@ -2753,8 +2799,8 @@ def preprocess_html_for_schema(html_content, text_threshold=100, attr_value_thre
             return result[:max_size] + "..."
 
         return result
-
-    except Exception as e:
+
+    except Exception:
         # Fallback for parsing errors
         return html_content[:max_size] if len(html_content) > max_size else html_content
 
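
As a usage sketch, the new helper should behave roughly as follows. The outputs are derived by reading the diff above rather than by running the library, so treat them as expected rather than verified values.

```python
from crawl4ai.utils import base_domain_url

print(base_domain_url("https://www.example.com/docs?page=2#intro"))
# expected: https://example.com/docs?page=2   ('www.' stripped, fragment dropped, query kept)

print(base_domain_url("https://example.com:443/docs"))
# expected: https://example.com/docs          (default HTTPS port dropped)

print(base_domain_url("https://example.com:8080/docs"))
# expected: https://example.com:8080/docs     (non-default port kept)

print(base_domain_url("not-a-url"))
# expected: not-a-url                         (no netloc, so the input is returned unchanged)
```

Note that, as written, get_base_domain_parsed keeps only the last two labels of the host (three for suffixes such as co.uk), so any subdomain collapses to the registrable domain: blog.example.com would map to example.com just as www.example.com does.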

tests/20241401/test_deep_crawl.py
21 additions, 4 deletions

@@ -1,14 +1,17 @@
 import sys
 import time
+import socket
 from httpx import codes
 import pytest
+import pytest_asyncio
 
 
 from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
 from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
 from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
 
 from pytest_httpserver import HTTPServer
+from unittest.mock import patch
 
 
 URLS = [
@@ -18,6 +21,12 @@
     "/level2/article2",
 ]
 
+@pytest_asyncio.fixture
+def mock_dns():
+    with patch('socket.gethostbyname') as mock_gethostbyname:
+        mock_gethostbyname.side_effect = lambda host: socket.gethostbyname('localhost' if host == 'www.localhost' else host)
+        yield
+
 @pytest.fixture
 def site(httpserver: HTTPServer) -> HTTPServer:
     """Fixture to serve multiple pages for a crawl."""
@@ -26,9 +35,11 @@ def site(httpserver: HTTPServer) -> HTTPServer:
         <a href="/level1">Go to level 1</a>
     </body></html>
     """)
-    httpserver.expect_request("/level1").respond_with_data(content_type="text/html", response_data="""
+    httpserver.expect_request("/level1").respond_with_data(content_type="text/html", response_data=f"""
     <html><body>
-        <a href="/level2/article1">Go to level 2 - Article 1</a>
+        <a href="/level2/article1">Go to level 2 - Article 1 (relative)</a>
+        <a href="{httpserver.url_for("/level2/article1")}">Go to level 2 - Article 1 (absolute)</a>
+        <a href="{httpserver.url_for("/level2/article1").replace("localhost", "www.localhost")}">Go to level 2 - Article 1 (absolute + www prefix)</a>
         <a href="/level2/article2">Go to level 2 - Article 2</a>
     </body></html>
     """)
@@ -42,10 +53,12 @@ def site(httpserver: HTTPServer) -> HTTPServer:
         <p>This is level 2 - Article 2</p>
     </body></html>
     """)
+    httpserver.expect_request("/favicon.ico").respond_with_data(status=codes.NOT_FOUND)
+
     return httpserver
 
 @pytest.mark.asyncio
-async def test_deep_crawl_batch(site: HTTPServer):
+async def test_deep_crawl_batch(site: HTTPServer, mock_dns):
     config = CrawlerRunConfig(
         deep_crawl_strategy = BFSDeepCrawlStrategy(
             max_depth=2,
@@ -73,8 +86,10 @@ async def test_deep_crawl_batch(site: HTTPServer):
         assert result.url == site.url_for(URLS[idx])
         assert result.status_code == codes.OK
 
+    site.check_assertions()
+
 @pytest.mark.asyncio
-async def test_deep_crawl_stream(site: HTTPServer):
+async def test_deep_crawl_stream(site: HTTPServer, mock_dns):
     config = CrawlerRunConfig(
         deep_crawl_strategy = BFSDeepCrawlStrategy(
             max_depth=2,
@@ -106,6 +121,8 @@ async def test_deep_crawl_stream(site: HTTPServer):
     print(f"Crawled {idx} pages")
    print(f"Duration: {time.perf_counter() - start_time:.2f} seconds")
 
+    site.check_assertions()
+
 if __name__ == "__main__":
     import subprocess
 