
Commit d02f9a4 (parent: fb7fe6a)

fix: deep crawl duplicate url processing

Fix BFSDeepCrawlStrategy so that URLs which differ only by base domain (for example a www. prefix) or by an explicit default port are processed once. The common case is www.example.com vs example.com, but the fix also covers https://example.com vs https://example.com:443. Fixes #843
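
The duplicates arise because the previous normalization in link_discovery kept the "www." prefix and any explicit default port, so such variants were tracked as separate frontier entries. A minimal illustration of the problem, using hypothetical URLs rather than anything from the crawl4ai code:

# Exact string comparison treats these as three distinct pages, so each
# variant could be queued and crawled separately before this fix.
variants = [
    "https://example.com/page",
    "https://www.example.com/page",
    "https://example.com:443/page",
]
assert len(set(variants)) == 3  # naive dedupe sees three "different" URLs
# This commit collapses all three onto a single comparison key, so the
# page is only processed once.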

File tree

3 files changed (+142, -49 lines)

crawl4ai/deep_crawling/bfs_strategy.py

Lines changed: 26 additions & 13 deletions
@@ -10,7 +10,7 @@
 from .scorers import URLScorer
 from . import DeepCrawlStrategy
 from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
-from ..utils import normalize_url_for_deep_crawl
+from ..utils import normalize_url_for_deep_crawl, comparison_url
 from math import inf as infinity
 
 class BFSDeepCrawlStrategy(DeepCrawlStrategy):
@@ -96,34 +96,46 @@ async def link_discovery(
         if self.include_external:
             links += result.links.get("external", [])
 
-        valid_links = []
-
+        valid_links: List[Tuple[str, float]] = []
+
         # First collect all valid links
+        seen: Set[str] = set()
         for link in links:
             url: Optional[str] = link.get("href")
             if not url:
                 continue
 
             # Strip URL fragments to avoid duplicate crawling
-            # base_url = url.split('#')[0] if url else url
-            base_url = normalize_url_for_deep_crawl(url, source_url)
-            if base_url in visited:
+            normalised_url = normalize_url_for_deep_crawl(url, source_url)
+            if normalised_url in visited or normalised_url in seen:
+                continue
+
+            # Check if we've seen this URL before, using the comparison URL.
+            comp_url: str = comparison_url(normalised_url)
+            if comp_url in visited or comp_url in seen:
                 continue
+
+            # Register as seen so we don't process it again; this avoids duplicates
+            # for URLs which have the same base domain, which would otherwise be
+            # added to next_depth multiple times. It also eliminates duplicate
+            # work in this loop processing the same URL multiple times.
+            seen.add(comp_url)
+
             if not await self.can_process_url(url, next_depth):
                 self.stats.urls_skipped += 1
                 continue
 
             # Score the URL if a scorer is provided
-            score = self.url_scorer.score(base_url) if self.url_scorer else 0
-
+            score = self.url_scorer.score(normalised_url) if self.url_scorer else 0
+
             # Skip URLs with scores below the threshold
             if score < self.score_threshold:
                 self.logger.debug(f"URL {url} skipped: score {score} below threshold {self.score_threshold}")
                 self.stats.urls_skipped += 1
                 continue
-
-            valid_links.append((base_url, score))
-
+
+            valid_links.append((normalised_url, score))
+
         # If we have more valid links than capacity, sort by score and take the top ones
         if self.max_pages > 0 and len(valid_links) > remaining_capacity:
             if self.url_scorer:
@@ -162,7 +174,8 @@ async def _arun_batch(
         while current_level and not self._cancel_event.is_set():
             next_level: List[Tuple[str, Optional[str]]] = []
             urls = [url for url, _ in current_level]
-            visited.update(urls)
+
+            visited.update([comparison_url(url) for url in urls])
 
             # Clone the config to disable deep crawling recursion and enforce batch mode.
             batch_config = config.clone(deep_crawl_strategy=None, stream=False)
@@ -204,7 +217,7 @@ async def _arun_stream(
         while current_level and not self._cancel_event.is_set():
             next_level: List[Tuple[str, Optional[str]]] = []
             urls = [url for url, _ in current_level]
-            visited.update(urls)
+            visited.update([comparison_url(url) for url in urls])
 
             stream_config = config.clone(deep_crawl_strategy=None, stream=True)
             stream_gen = await crawler.arun_many(urls=urls, config=stream_config)
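
For reference, a minimal sketch (not the strategy's own code) of how the bookkeeping behaves once visited is keyed on comparison URLs; it assumes comparison_url is importable from crawl4ai.utils as added by this commit and that DEFAULT_PORTS treats 443 as the https default:

from crawl4ai.utils import comparison_url

visited: set[str] = set()
current_level = ["https://example.com/", "https://www.example.com:443/"]

# Both variants reduce to the same comparison key, so a later
# `comparison_url(...) in visited` check in link_discovery filters the
# second one out instead of queuing it again.
visited.update(comparison_url(url) for url in current_level)
assert visited == {"https://example.com/"}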

crawl4ai/utils.py

Lines changed: 94 additions & 32 deletions
@@ -1974,8 +1974,6 @@ def fast_format_html(html_string):
 
 def normalize_url(href, base_url):
     """Normalize URLs to ensure consistent format"""
-    from urllib.parse import urljoin, urlparse
-
     # Parse base URL to get components
     parsed_base = urlparse(base_url)
     if not parsed_base.scheme or not parsed_base.netloc:
@@ -1988,47 +1986,26 @@ def normalize_url(href, base_url):
 
 def normalize_url_for_deep_crawl(href: str, base_url: str) -> str:
     """Normalize URLs to ensure consistent format"""
-    from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode
 
     # Use urljoin to handle relative URLs
     full_url = urljoin(base_url, href.strip())
 
     # Parse the URL for normalization
-    parsed = urlparse(full_url)
-
-    # Convert hostname to lowercase
-    netloc = parsed.netloc.lower()
-
-    # Remove fragment entirely
-    fragment = ''
-
-    # Normalize query parameters if needed
-    query = parsed.query
-    if query:
-        # Parse query parameters
-        params = parse_qs(query)
-
-        # Remove tracking parameters (example - customize as needed)
-        tracking_params = ['utm_source', 'utm_medium', 'utm_campaign', 'ref', 'fbclid']
-        for param in tracking_params:
-            if param in params:
-                del params[param]
-
-        # Rebuild query string, sorted for consistency
-        query = urlencode(params, doseq=True) if params else ''
-
+    parsed: ParseResult = urlparse(full_url)
+
     # Build normalized URL
     normalized = urlunparse((
         parsed.scheme,
-        netloc,
+        normalize_netloc(parsed),
         parsed.path.rstrip('/') or '/',  # Normalize trailing slash
         parsed.params,
-        query,
-        fragment
+        normalize_query(parsed.query),
+        "",  # Remove fragment entirely
     ))
-
+
     return normalized
 
+
 @lru_cache(maxsize=10000)
 def efficient_normalize_url_for_deep_crawl(href, base_url):
     """Efficient URL normalization with proper parsing"""
@@ -2152,6 +2129,91 @@ def get_base_domain(url: str) -> str:
     except Exception:
         return ""
 
+def normalize_netloc(parsed: ParseResult, remove_www: bool = False) -> str:
+    """
+    Normalize the netloc (network location) of a parsed URL.
+
+    Ensures the netloc is in lowercase, removes the port if it matches the
+    default for the scheme, and optionally removes the 'www.' prefix.
+
+    If remove_www is True, the returned netloc may not result in a valid URL
+    or one which would result in the same content being fetched. It should
+    only be used for comparison purposes.
+
+    Args:
+        parsed (ParseResult): The parsed URL.
+        remove_www (bool): Whether to remove the 'www.' prefix. Defaults to False.
+    Returns:
+        str: The normalized netloc.
+    """
+    netloc: str = parsed.netloc.lower()
+    if not netloc:
+        return ""
+
+    # Remove port.
+    netloc = netloc.split(":")[0]
+    if remove_www:
+        netloc = netloc.removeprefix("www.")
+
+    port = parsed.port
+    if port is not None and port != DEFAULT_PORTS.get(parsed.scheme):
+        # Port needed.
+        netloc = f"{netloc}:{port}"
+
+    return netloc
+
+def normalize_query(query: str) -> str:
+    """
+    Normalize the query parameters of a parsed URL.
+    Ensures that tracking parameters are removed and the query string is
+    sorted for consistency.
+    Args:
+        query (str): The query string of the parsed URL.
+    Returns:
+        str: The normalized query string.
+    """
+    from urllib.parse import parse_qs, urlencode
+
+    if not query:
+        return ""
+
+    # Parse query parameters
+    params = parse_qs(query)
+
+    # Remove tracking parameters (example - customize as needed)
+    tracking_params = ['utm_source', 'utm_medium', 'utm_campaign', 'ref', 'fbclid']
+    for param in tracking_params:
+        if param in params:
+            del params[param]
+
+    # Rebuild query string, sorted for consistency
+    return urlencode(params, doseq=True) if params else ""
+
+def comparison_url(url: str) -> str:
+    """
+    Return a URL which can be used for comparison purposes only.
+
+    The returned URL is not guaranteed to be valid or to result in the
+    same content being fetched as the original.
+
+    Args:
+        url (str): The URL to normalize.
+
+    Returns:
+        str: The comparison URL or the original URL if parsing fails.
+    """
+    try:
+        parsed: ParseResult = urlparse(url)
+        return urlunparse((
+            "https",
+            normalize_netloc(parsed, remove_www=True),
+            parsed.path,
+            parsed.params,
+            normalize_query(parsed.query),
+            ""
+        ))
+    except Exception:
+        return url
 
 def is_external_url(url: str, base_domain: str) -> bool:
     """
@@ -2753,8 +2815,8 @@ def preprocess_html_for_schema(html_content, text_threshold=100, attr_value_thre
             return result[:max_size] + "..."
 
         return result
-
-    except Exception as e:
+
+    except Exception:
         # Fallback for parsing errors
         return html_content[:max_size] if len(html_content) > max_size else html_content
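
For context, a hedged usage sketch of the new helpers; the expected values assume DEFAULT_PORTS maps "http" to 80 and "https" to 443, which the commit message implies but this diff does not show:

from urllib.parse import urlparse
from crawl4ai.utils import comparison_url, normalize_netloc, normalize_query

# Case, a "www." prefix, a scheme-default port and tracking parameters all
# collapse into one comparison key:
assert comparison_url("HTTP://WWW.Example.com:80/a?utm_source=x&b=1") == "https://example.com/a?b=1"

# A non-default port is kept, since it changes which server is contacted:
assert normalize_netloc(urlparse("https://example.com:8443/docs")) == "example.com:8443"

# Tracking parameters are dropped; other parameters are preserved:
assert normalize_query("utm_campaign=spring&page=2") == "page=2"

Note that comparison_url always rewrites the scheme to "https", so its result is only meant as a set or dictionary key, not as a fetchable URL.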

tests/20241401/test_deep_crawl.py

Lines changed: 22 additions & 4 deletions
@@ -1,5 +1,7 @@
 import sys
 import time
+import socket
+from typing import Generator, Any
 from httpx import codes
 import pytest
 
@@ -9,6 +11,7 @@
 from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
 
 from pytest_httpserver import HTTPServer
+from unittest.mock import patch
 
 
 URLS = [
@@ -18,6 +21,12 @@
     "/level2/article2",
 ]
 
+@pytest.fixture
+def mock_dns() -> Generator[None, Any, None]:
+    with patch('socket.gethostbyname') as mock_gethostbyname:
+        mock_gethostbyname.side_effect = lambda host: socket.gethostbyname('localhost' if host == 'www.localhost' else host)
+        yield
+
 @pytest.fixture
 def site(httpserver: HTTPServer) -> HTTPServer:
     """Fixture to serve multiple pages for a crawl."""
@@ -26,9 +35,12 @@ def site(httpserver: HTTPServer) -> HTTPServer:
         <a href="/level1">Go to level 1</a>
     </body></html>
     """)
-    httpserver.expect_request("/level1").respond_with_data(content_type="text/html", response_data="""
+    httpserver.expect_request("/level1").respond_with_data(content_type="text/html", response_data=f"""
     <html><body>
-        <a href="/level2/article1">Go to level 2 - Article 1</a>
+        <a href="/level2/article1">Go to level 2 - Article 1 (relative)</a>
+        <a href="{httpserver.url_for("/level2/article1")}">Go to level 2 - Article 1 (absolute)</a>
+        <a href="{httpserver.url_for("/level2/article1").replace("localhost", "www.localhost")}">Go to level 2 - Article 1 (absolute + www prefix)</a>
+        <a href="{httpserver.url_for("/level2/article1").replace("localhost", "localhost:80")}">Go to level 2 - Article 1 (absolute + schema port)</a>
         <a href="/level2/article2">Go to level 2 - Article 2</a>
     </body></html>
     """)
@@ -42,10 +54,12 @@ def site(httpserver: HTTPServer) -> HTTPServer:
         <p>This is level 2 - Article 2</p>
     </body></html>
     """)
+    httpserver.expect_request("/favicon.ico").respond_with_data(status=codes.NOT_FOUND)
+
     return httpserver
 
 @pytest.mark.asyncio
-async def test_deep_crawl_batch(site: HTTPServer):
+async def test_deep_crawl_batch(site: HTTPServer, mock_dns: Generator[None, Any, None]):
     config = CrawlerRunConfig(
         deep_crawl_strategy = BFSDeepCrawlStrategy(
             max_depth=2,
@@ -73,8 +87,10 @@ async def test_deep_crawl_batch(site: HTTPServer):
         assert result.url == site.url_for(URLS[idx])
         assert result.status_code == codes.OK
 
+    site.check_assertions()
+
 @pytest.mark.asyncio
-async def test_deep_crawl_stream(site: HTTPServer):
+async def test_deep_crawl_stream(site: HTTPServer, mock_dns: Generator[None, Any, None]):
     config = CrawlerRunConfig(
         deep_crawl_strategy = BFSDeepCrawlStrategy(
             max_depth=2,
@@ -106,6 +122,8 @@ async def test_deep_crawl_stream(site: HTTPServer):
     print(f"Crawled {idx} pages")
     print(f"Duration: {time.perf_counter() - start_time:.2f} seconds")
 
+    site.check_assertions()
+
 if __name__ == "__main__":
     import subprocess
 