
Commit d02f9a4 (parent: fb7fe6a)

fix: deep crawl duplicate url processing

Fix BFSDeepCrawlStrategy so that URLs which differ only by base domain (for example a www. prefix) or by an explicit default port are processed once. The common case is www.example.com vs example.com, but the fix also covers https://example.com vs https://example.com:443. Fixes #843
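
The duplicates arise because the previous normalization in link_discovery kept the "www." prefix and any explicit default port, so such variants were tracked as separate frontier entries. A minimal illustration of the problem, using hypothetical URLs rather than anything from the crawl4ai code:

# Exact string comparison treats these as three distinct pages, so each
# variant could be queued and crawled separately before this fix.
variants = [
    "https://example.com/page",
    "https://www.example.com/page",
    "https://example.com:443/page",
]
assert len(set(variants)) == 3  # naive dedupe sees three "different" URLs
# This commit collapses all three onto a single comparison key, so the
# page is only processed once.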

File tree

3 files changed (+142, -49 lines)

crawl4ai/deep_crawling/bfs_strategy.py

Lines changed: 26 additions & 13 deletions
@@ -10,7 +10,7 @@
 from .scorers import URLScorer
 from . import DeepCrawlStrategy
 from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
-from ..utils import normalize_url_for_deep_crawl
+from ..utils import normalize_url_for_deep_crawl, comparison_url
 from math import inf as infinity
 
 class BFSDeepCrawlStrategy(DeepCrawlStrategy):
@@ -96,34 +96,46 @@ async def link_discovery(
         if self.include_external:
             links += result.links.get("external", [])
 
-        valid_links = []
-
+        valid_links: List[Tuple[str, float]] = []
+
         # First collect all valid links
+        seen: Set[str] = set()
         for link in links:
             url: Optional[str] = link.get("href")
             if not url:
                 continue
 
             # Strip URL fragments to avoid duplicate crawling
-            # base_url = url.split('#')[0] if url else url
-            base_url = normalize_url_for_deep_crawl(url, source_url)
-            if base_url in visited:
+            normalised_url = normalize_url_for_deep_crawl(url, source_url)
+            if normalised_url in visited or normalised_url in seen:
+                continue
+
+            # Check if we've seen this URL before, using the comparison URL.
+            comp_url: str = comparison_url(normalised_url)
+            if comp_url in visited or comp_url in seen:
                 continue
+
+            # Register as seen so we don't process it again; this avoids duplicates
+            # for URLs which have the same base domain, which would otherwise be
+            # added to next_depth multiple times. It also eliminates duplicate
+            # work in this loop processing the same URL multiple times.
+            seen.add(comp_url)
+
             if not await self.can_process_url(url, next_depth):
                 self.stats.urls_skipped += 1
                 continue
 
             # Score the URL if a scorer is provided
-            score = self.url_scorer.score(base_url) if self.url_scorer else 0
-
+            score = self.url_scorer.score(normalised_url) if self.url_scorer else 0
+
             # Skip URLs with scores below the threshold
             if score < self.score_threshold:
                 self.logger.debug(f"URL {url} skipped: score {score} below threshold {self.score_threshold}")
                 self.stats.urls_skipped += 1
                 continue
-
-            valid_links.append((base_url, score))
-
+
+            valid_links.append((normalised_url, score))
+
         # If we have more valid links than capacity, sort by score and take the top ones
         if self.max_pages > 0 and len(valid_links) > remaining_capacity:
             if self.url_scorer:
@@ -162,7 +174,8 @@ async def _arun_batch(
         while current_level and not self._cancel_event.is_set():
             next_level: List[Tuple[str, Optional[str]]] = []
             urls = [url for url, _ in current_level]
-            visited.update(urls)
+
+            visited.update([comparison_url(url) for url in urls])
 
             # Clone the config to disable deep crawling recursion and enforce batch mode.
             batch_config = config.clone(deep_crawl_strategy=None, stream=False)
@@ -204,7 +217,7 @@ async def _arun_stream(
         while current_level and not self._cancel_event.is_set():
             next_level: List[Tuple[str, Optional[str]]] = []
             urls = [url for url, _ in current_level]
-            visited.update(urls)
+            visited.update([comparison_url(url) for url in urls])
 
             stream_config = config.clone(deep_crawl_strategy=None, stream=True)
             stream_gen = await crawler.arun_many(urls=urls, config=stream_config)
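
For reference, a minimal sketch (not the strategy's own code) of how the bookkeeping behaves once visited is keyed on comparison URLs; it assumes comparison_url is importable from crawl4ai.utils as added by this commit and that DEFAULT_PORTS treats 443 as the https default:

from crawl4ai.utils import comparison_url

visited: set[str] = set()
current_level = ["https://example.com/", "https://www.example.com:443/"]

# Both variants reduce to the same comparison key, so a later
# `comparison_url(...) in visited` check in link_discovery filters the
# second one out instead of queuing it again.
visited.update(comparison_url(url) for url in current_level)
assert visited == {"https://example.com/"}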

crawl4ai/utils.py

Lines changed: 94 additions & 32 deletions
@@ -1974,8 +1974,6 @@ def fast_format_html(html_string):
 
 def normalize_url(href, base_url):
     """Normalize URLs to ensure consistent format"""
-    from urllib.parse import urljoin, urlparse
-
     # Parse base URL to get components
     parsed_base = urlparse(base_url)
     if not parsed_base.scheme or not parsed_base.netloc:
@@ -1988,47 +1986,26 @@ def normalize_url(href, base_url):
 
 def normalize_url_for_deep_crawl(href: str, base_url: str) -> str:
     """Normalize URLs to ensure consistent format"""
-    from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode
 
     # Use urljoin to handle relative URLs
     full_url = urljoin(base_url, href.strip())
 
     # Parse the URL for normalization
-    parsed = urlparse(full_url)
-
-    # Convert hostname to lowercase
-    netloc = parsed.netloc.lower()
-
-    # Remove fragment entirely
-    fragment = ''
-
-    # Normalize query parameters if needed
-    query = parsed.query
-    if query:
-        # Parse query parameters
-        params = parse_qs(query)
-
-        # Remove tracking parameters (example - customize as needed)
-        tracking_params = ['utm_source', 'utm_medium', 'utm_campaign', 'ref', 'fbclid']
-        for param in tracking_params:
-            if param in params:
-                del params[param]
-
-        # Rebuild query string, sorted for consistency
-        query = urlencode(params, doseq=True) if params else ''
-
+    parsed: ParseResult = urlparse(full_url)
+
     # Build normalized URL
     normalized = urlunparse((
         parsed.scheme,
-        netloc,
+        normalize_netloc(parsed),
         parsed.path.rstrip('/') or '/',  # Normalize trailing slash
         parsed.params,
-        query,
-        fragment
+        normalize_query(parsed.query),
+        "",  # Remove fragment entirely
     ))
-
+
     return normalized
 
+
 @lru_cache(maxsize=10000)
 def efficient_normalize_url_for_deep_crawl(href, base_url):
     """Efficient URL normalization with proper parsing"""
@@ -2152,6 +2129,91 @@ def get_base_domain(url: str) -> str:
     except Exception:
         return ""
 
+def normalize_netloc(parsed: ParseResult, remove_www: bool = False) -> str:
+    """
+    Normalize the netloc (network location) of a parsed URL.
+
+    Ensures the netloc is in lowercase, removes the port if it matches the
+    default for the scheme, and optionally removes the 'www.' prefix.
+
+    If remove_www is True, the returned netloc may not result in a valid URL
+    or one which would result in the same content being fetched. It should
+    only be used for comparison purposes.
+
+    Args:
+        parsed (ParseResult): The parsed URL.
+        remove_www (bool): Whether to remove the 'www.' prefix. Defaults to False.
+    Returns:
+        str: The normalized netloc.
+    """
+    netloc: str = parsed.netloc.lower()
+    if not netloc:
+        return ""
+
+    # Remove port.
+    netloc = netloc.split(":")[0]
+    if remove_www:
+        netloc = netloc.removeprefix("www.")
+
+    port = parsed.port
+    if port is not None and port != DEFAULT_PORTS.get(parsed.scheme):
+        # Port needed.
+        netloc = f"{netloc}:{port}"
+
+    return netloc
+
+def normalize_query(query: str) -> str:
+    """
+    Normalize the query parameters of a parsed URL.
+    Ensures that tracking parameters are removed and the query string is
+    sorted for consistency.
+    Args:
+        query (str): The query string of the parsed URL.
+    Returns:
+        str: The normalized query string.
+    """
+    from urllib.parse import parse_qs, urlencode
+
+    if not query:
+        return ""
+
+    # Parse query parameters
+    params = parse_qs(query)
+
+    # Remove tracking parameters (example - customize as needed)
+    tracking_params = ['utm_source', 'utm_medium', 'utm_campaign', 'ref', 'fbclid']
+    for param in tracking_params:
+        if param in params:
+            del params[param]
+
+    # Rebuild query string, sorted for consistency
+    return urlencode(params, doseq=True) if params else ""
+
+def comparison_url(url: str) -> str:
+    """
+    Return a URL which can be used for comparison purposes only.
+
+    The returned URL is not guaranteed to be valid or to result in the
+    same content being fetched as the original.
+
+    Args:
+        url (str): The URL to normalize.
+
+    Returns:
+        str: The comparison URL or the original URL if parsing fails.
+    """
+    try:
+        parsed: ParseResult = urlparse(url)
+        return urlunparse((
+            "https",
+            normalize_netloc(parsed, remove_www=True),
+            parsed.path,
+            parsed.params,
+            normalize_query(parsed.query),
+            ""
+        ))
+    except Exception:
+        return url
 
 def is_external_url(url: str, base_domain: str) -> bool:
     """
@@ -2753,8 +2815,8 @@ def preprocess_html_for_schema(html_content, text_threshold=100, attr_value_thre
             return result[:max_size] + "..."
 
         return result
-
-    except Exception as e:
+
+    except Exception:
         # Fallback for parsing errors
         return html_content[:max_size] if len(html_content) > max_size else html_content
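
For context, a hedged usage sketch of the new helpers; the expected values assume DEFAULT_PORTS maps "http" to 80 and "https" to 443, which the commit message implies but this diff does not show:

from urllib.parse import urlparse
from crawl4ai.utils import comparison_url, normalize_netloc, normalize_query

# Case, a "www." prefix, a scheme-default port and tracking parameters all
# collapse into one comparison key:
assert comparison_url("HTTP://WWW.Example.com:80/a?utm_source=x&b=1") == "https://example.com/a?b=1"

# A non-default port is kept, since it changes which server is contacted:
assert normalize_netloc(urlparse("https://example.com:8443/docs")) == "example.com:8443"

# Tracking parameters are dropped; other parameters are preserved:
assert normalize_query("utm_campaign=spring&page=2") == "page=2"

Note that comparison_url always rewrites the scheme to "https", so its result is only meant as a set or dictionary key, not as a fetchable URL.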

tests/20241401/test_deep_crawl.py

Lines changed: 22 additions & 4 deletions
@@ -1,5 +1,7 @@
 import sys
 import time
+import socket
+from typing import Generator, Any
 from httpx import codes
 import pytest
 
@@ -9,6 +11,7 @@
 from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
 
 from pytest_httpserver import HTTPServer
+from unittest.mock import patch
 
 
 URLS = [
@@ -18,6 +21,12 @@
     "/level2/article2",
 ]
 
+@pytest.fixture
+def mock_dns() -> Generator[None, Any, None]:
+    with patch('socket.gethostbyname') as mock_gethostbyname:
+        mock_gethostbyname.side_effect = lambda host: socket.gethostbyname('localhost' if host == 'www.localhost' else host)
+        yield
+
 @pytest.fixture
 def site(httpserver: HTTPServer) -> HTTPServer:
     """Fixture to serve multiple pages for a crawl."""
@@ -26,9 +35,12 @@ def site(httpserver: HTTPServer) -> HTTPServer:
         <a href="/level1">Go to level 1</a>
     </body></html>
     """)
-    httpserver.expect_request("/level1").respond_with_data(content_type="text/html", response_data="""
+    httpserver.expect_request("/level1").respond_with_data(content_type="text/html", response_data=f"""
     <html><body>
-        <a href="/level2/article1">Go to level 2 - Article 1</a>
+        <a href="/level2/article1">Go to level 2 - Article 1 (relative)</a>
+        <a href="{httpserver.url_for("/level2/article1")}">Go to level 2 - Article 1 (absolute)</a>
+        <a href="{httpserver.url_for("/level2/article1").replace("localhost", "www.localhost")}">Go to level 2 - Article 1 (absolute + www prefix)</a>
+        <a href="{httpserver.url_for("/level2/article1").replace("localhost", "localhost:80")}">Go to level 2 - Article 1 (absolute + schema port)</a>
         <a href="/level2/article2">Go to level 2 - Article 2</a>
     </body></html>
     """)
@@ -42,10 +54,12 @@ def site(httpserver: HTTPServer) -> HTTPServer:
         <p>This is level 2 - Article 2</p>
     </body></html>
     """)
+    httpserver.expect_request("/favicon.ico").respond_with_data(status=codes.NOT_FOUND)
+
     return httpserver
 
 @pytest.mark.asyncio
-async def test_deep_crawl_batch(site: HTTPServer):
+async def test_deep_crawl_batch(site: HTTPServer, mock_dns: Generator[None, Any, None]):
     config = CrawlerRunConfig(
         deep_crawl_strategy = BFSDeepCrawlStrategy(
             max_depth=2,
@@ -73,8 +87,10 @@ async def test_deep_crawl_batch(site: HTTPServer):
         assert result.url == site.url_for(URLS[idx])
         assert result.status_code == codes.OK
 
+    site.check_assertions()
+
 @pytest.mark.asyncio
-async def test_deep_crawl_stream(site: HTTPServer):
+async def test_deep_crawl_stream(site: HTTPServer, mock_dns: Generator[None, Any, None]):
     config = CrawlerRunConfig(
         deep_crawl_strategy = BFSDeepCrawlStrategy(
             max_depth=2,
@@ -106,6 +122,8 @@ async def test_deep_crawl_stream(site: HTTPServer):
     print(f"Crawled {idx} pages")
     print(f"Duration: {time.perf_counter() - start_time:.2f} seconds")
 
+    site.check_assertions()
+
 if __name__ == "__main__":
     import subprocess
 