
Commit 3a1e872

fix: deep crawl duplicate url processing
Fix BFSDeepCrawlStrategy so that URLs which differ only in their domain spelling or port are processed once. The common case is www.example.com vs example.com, but this also covers https://example.com vs https://example.com:443. Fixes #843
1 parent: fb7fe6a
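
The failure mode is easy to see in isolation. The toy sketch below is illustrative only (it is not the repository's code): three spellings of the same page produce three distinct keys in a plain string-keyed visited set, so the page would be queued more than once.

```python
# Illustrative only: the same page spelled three ways yields three distinct
# strings, so a naive visited-set check lets every variant through.
variants = [
    "https://example.com/page",
    "https://www.example.com/page",
    "https://example.com:443/page",
]

visited: set[str] = set()
queued: list[str] = []
for url in variants:
    if url in visited:      # never true here: the raw strings all differ
        continue
    visited.add(url)
    queued.append(url)

print(len(queued))  # 3 -- the same page would be crawled three times
```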

File tree

3 files changed: +128, -53 lines

crawl4ai/deep_crawling/bfs_strategy.py
19 additions, 7 deletions

@@ -10,7 +10,7 @@
 from .scorers import URLScorer
 from . import DeepCrawlStrategy
 from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
-from ..utils import normalize_url_for_deep_crawl
+from ..utils import normalize_url_for_deep_crawl, base_domain_url
 from math import inf as infinity
 
 class BFSDeepCrawlStrategy(DeepCrawlStrategy):
@@ -96,19 +96,31 @@ async def link_discovery(
         if self.include_external:
             links += result.links.get("external", [])
 
-        valid_links = []
-
+        valid_links: List[Tuple[str, float]] = []
+
         # First collect all valid links
+        seen: Set[str] = set()
         for link in links:
             url: Optional[str] = link.get("href")
             if not url:
                 continue
 
             # Strip URL fragments to avoid duplicate crawling
-            # base_url = url.split('#')[0] if url else url
             base_url = normalize_url_for_deep_crawl(url, source_url)
-            if base_url in visited:
+            if base_url in visited or base_url in seen:
+                continue
+
+            # Normalize the URL to its base domain
+            domain_url: str = base_domain_url(base_url)
+            if domain_url in visited or domain_url in seen:
                 continue
+
+            # Register as seen so we don't process it again, this avoids duplicates
+            # for URLs which have the same base domain, which would otherwise be
+            # added to next_depth multiple times. This also eliminates duplicate
+            # work in this loop processing the same URL multiple times.
+            seen.add(domain_url)
+
             if not await self.can_process_url(url, next_depth):
                 self.stats.urls_skipped += 1
                 continue
@@ -161,7 +173,7 @@ async def _arun_batch(
 
         while current_level and not self._cancel_event.is_set():
             next_level: List[Tuple[str, Optional[str]]] = []
-            urls = [url for url, _ in current_level]
+            urls = [base_domain_url(url) for url, _ in current_level]
            visited.update(urls)
 
             # Clone the config to disable deep crawling recursion and enforce batch mode.
@@ -203,7 +215,7 @@ async def _arun_stream(
 
         while current_level and not self._cancel_event.is_set():
             next_level: List[Tuple[str, Optional[str]]] = []
-            urls = [url for url, _ in current_level]
+            urls = [base_domain_url(url) for url, _ in current_level]
             visited.update(urls)
 
             stream_config = config.clone(deep_crawl_strategy=None, stream=True)
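
Taken together, the new `seen` set and the base-domain key behave like the standalone sketch below. The `normalize` helper here is a simplified stand-in for the library's `normalize_url_for_deep_crawl`/`base_domain_url` pair, written only to show the dedup pattern; it is not the actual implementation.

```python
from urllib.parse import urlparse, urlunparse

def normalize(url: str) -> str:
    """Simplified stand-in: lowercase host, drop 'www.', default ports, and fragments."""
    p = urlparse(url)
    host = p.netloc.lower().split(":")[0].removeprefix("www.")
    default = {"http": 80, "https": 443}.get(p.scheme)
    netloc = host if p.port in (None, default) else f"{host}:{p.port}"
    return urlunparse((p.scheme, netloc, p.path, p.params, p.query, ""))

links = [
    "https://example.com/a",
    "https://www.example.com/a",
    "https://example.com:443/a",
    "https://example.com/b",
]

visited: set[str] = set()   # URLs already crawled at earlier depths
seen: set[str] = set()      # URLs collected for the next depth in this pass
next_level: list[str] = []
for url in links:
    key = normalize(url)
    if key in visited or key in seen:
        continue            # duplicate of something already crawled or already queued
    seen.add(key)           # register before queueing so later variants are skipped
    next_level.append(url)

print(next_level)  # ['https://example.com/a', 'https://example.com/b']
```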

crawl4ai/utils.py
88 additions, 42 deletions

@@ -1974,8 +1974,6 @@ def fast_format_html(html_string):
 
 def normalize_url(href, base_url):
     """Normalize URLs to ensure consistent format"""
-    from urllib.parse import urljoin, urlparse
-
     # Parse base URL to get components
     parsed_base = urlparse(base_url)
     if not parsed_base.scheme or not parsed_base.netloc:
@@ -1988,7 +1986,7 @@ def normalize_url(href, base_url):
 
 def normalize_url_for_deep_crawl(href: str, base_url: str) -> str:
     """Normalize URLs to ensure consistent format"""
-    from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode
+    from urllib.parse import parse_qs, urlencode
 
     # Use urljoin to handle relative URLs
     full_url = urljoin(base_url, href.strip())
@@ -2029,6 +2027,37 @@ def normalize_url_for_deep_crawl(href: str, base_url: str) -> str:
 
     return normalized
 
+def base_domain_url(url: str) -> str:
+    """Return the URL using the base domain.
+
+    This can be used to ensure that we don't revisit the same URL
+    multiple times even if the domain changes e.g. www.example.com vs example.com
+    or if the URL has a port number which is not needed e.g. example.com:80.
+
+    See `get_base_domain` for details on how the base domain is extracted.
+    Args:
+        url (str): The URL to extract the base domain from.
+    Returns:
+        str: The url using the base domain or url unchanged if parsing fails.
+    """
+
+    try:
+        parsed: ParseResult = urlparse(url)
+        base_domain: str = get_base_domain_parsed(parsed)
+        if not base_domain:
+            return url
+
+        return urlunparse((
+            parsed.scheme,
+            base_domain,
+            parsed.path,
+            parsed.params,
+            parsed.query,
+            ""
+        ))
+    except Exception:
+        return url
+
 @lru_cache(maxsize=10000)
 def efficient_normalize_url_for_deep_crawl(href, base_url):
     """Efficient URL normalization with proper parsing"""
@@ -2112,47 +2141,64 @@ def get_base_domain(url: str) -> str:
     """
     try:
         parsed: ParseResult = urlparse(url)
-        domain = parsed.netloc.lower()
-        if not domain:
-            return ""
-
-        # Remove port if present
-        domain = domain.split(":")[0]
-
-        # Remove www
-        domain = domain.removeprefix("www.")
-
-        port_suffix: str = ""
-        port = parsed.port
-        if port is not None and port != DEFAULT_PORTS.get(parsed.scheme):
-            # Port needed.
-            port_suffix = f":{port}"
-
-
-        # Extract last two parts of domain (handles co.uk etc)
-        parts = domain.split(".")
-        if len(parts) > 2 and parts[-2] in {
-            "co",
-            "com",
-            "org",
-            "gov",
-            "edu",
-            "net",
-            "mil",
-            "int",
-            "ac",
-            "ad",
-            "ae",
-            "af",
-            "ag",
-        }:
-            return ".".join(parts[-3:]) + port_suffix
-
-        return ".".join(parts[-2:]) + port_suffix
+        return get_base_domain_parsed(parsed)
     except Exception:
         return ""
 
 
+def get_base_domain_parsed(parsed: ParseResult) -> str:
+    """
+    Extract the base domain from the parsed URL, handling common edge cases.
+
+    How it works:
+    1. Parses the URL to extract the domain.
+    2. Removes the port number and 'www' prefix if necessary.
+    3. Handles special domains (e.g., 'co.uk') to extract the correct base.
+
+    Args:
+        parsed (str): The parsed URL to extract the base domain from.
+
+    Returns:
+        str: The extracted base domain or an empty string if netloc is empty.
+    """
+    domain: str = parsed.netloc.lower()
+    if not domain:
+        return ""
+
+    # Remove port if present
+    domain = domain.split(":")[0]
+
+    # Remove www
+    domain = domain.removeprefix("www.")
+
+    port_suffix: str = ""
+    port = parsed.port
+    if port is not None and port != DEFAULT_PORTS.get(parsed.scheme):
+        # Port needed.
+        port_suffix = f":{port}"
+
+    # Extract last two parts of domain (handles co.uk etc)
+    parts = domain.split(".")
+    if len(parts) > 2 and parts[-2] in {
+        "co",
+        "com",
+        "org",
+        "gov",
+        "edu",
+        "net",
+        "mil",
+        "int",
+        "ac",
+        "ad",
+        "ae",
+        "af",
+        "ag",
+    }:
+        return ".".join(parts[-3:]) + port_suffix
+
+    return ".".join(parts[-2:]) + port_suffix
+
+
 def is_external_url(url: str, base_domain: str) -> bool:
     """
     Extract the base domain from a given URL, handling common edge cases.
@@ -2753,8 +2799,8 @@ def preprocess_html_for_schema(html_content, text_threshold=100, attr_value_thre
             return result[:max_size] + "..."
 
         return result
-
-    except Exception as e:
+
+    except Exception:
         # Fallback for parsing errors
         return html_content[:max_size] if len(html_content) > max_size else html_content
 
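
As a usage sketch, the new helper should behave roughly as follows. The outputs are derived by reading the diff above rather than by running the library, so treat them as expected rather than verified values.

```python
from crawl4ai.utils import base_domain_url

print(base_domain_url("https://www.example.com/docs?page=2#intro"))
# expected: https://example.com/docs?page=2   ('www.' stripped, fragment dropped, query kept)

print(base_domain_url("https://example.com:443/docs"))
# expected: https://example.com/docs          (default HTTPS port dropped)

print(base_domain_url("https://example.com:8080/docs"))
# expected: https://example.com:8080/docs     (non-default port kept)

print(base_domain_url("not-a-url"))
# expected: not-a-url                         (no netloc, so the input is returned unchanged)
```

Note that, as written, get_base_domain_parsed keeps only the last two labels of the host (three for suffixes such as co.uk), so any subdomain collapses to the registrable domain: blog.example.com would map to example.com just as www.example.com does.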

tests/20241401/test_deep_crawl.py
21 additions, 4 deletions

@@ -1,14 +1,17 @@
 import sys
 import time
+import socket
 from httpx import codes
 import pytest
+import pytest_asyncio
 
 
 from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
 from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
 from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
 
 from pytest_httpserver import HTTPServer
+from unittest.mock import patch
 
 
 URLS = [
@@ -18,6 +21,12 @@
     "/level2/article2",
 ]
 
+@pytest_asyncio.fixture
+def mock_dns():
+    with patch('socket.gethostbyname') as mock_gethostbyname:
+        mock_gethostbyname.side_effect = lambda host: socket.gethostbyname('localhost' if host == 'www.localhost' else host)
+        yield
+
 @pytest.fixture
 def site(httpserver: HTTPServer) -> HTTPServer:
     """Fixture to serve multiple pages for a crawl."""
@@ -26,9 +35,11 @@ def site(httpserver: HTTPServer) -> HTTPServer:
         <a href="/level1">Go to level 1</a>
     </body></html>
     """)
-    httpserver.expect_request("/level1").respond_with_data(content_type="text/html", response_data="""
+    httpserver.expect_request("/level1").respond_with_data(content_type="text/html", response_data=f"""
     <html><body>
-        <a href="/level2/article1">Go to level 2 - Article 1</a>
+        <a href="/level2/article1">Go to level 2 - Article 1 (relative)</a>
+        <a href="{httpserver.url_for("/level2/article1")}">Go to level 2 - Article 1 (absolute)</a>
+        <a href="{httpserver.url_for("/level2/article1").replace("localhost", "www.localhost")}">Go to level 2 - Article 1 (absolute + www prefix)</a>
         <a href="/level2/article2">Go to level 2 - Article 2</a>
     </body></html>
     """)
@@ -42,10 +53,12 @@ def site(httpserver: HTTPServer) -> HTTPServer:
         <p>This is level 2 - Article 2</p>
     </body></html>
     """)
+    httpserver.expect_request("/favicon.ico").respond_with_data(status=codes.NOT_FOUND)
+
     return httpserver
 
 @pytest.mark.asyncio
-async def test_deep_crawl_batch(site: HTTPServer):
+async def test_deep_crawl_batch(site: HTTPServer, mock_dns):
     config = CrawlerRunConfig(
         deep_crawl_strategy = BFSDeepCrawlStrategy(
             max_depth=2,
@@ -73,8 +86,10 @@ async def test_deep_crawl_batch(site: HTTPServer):
         assert result.url == site.url_for(URLS[idx])
         assert result.status_code == codes.OK
 
+    site.check_assertions()
+
 @pytest.mark.asyncio
-async def test_deep_crawl_stream(site: HTTPServer):
+async def test_deep_crawl_stream(site: HTTPServer, mock_dns):
     config = CrawlerRunConfig(
         deep_crawl_strategy = BFSDeepCrawlStrategy(
             max_depth=2,
@@ -106,6 +121,8 @@ async def test_deep_crawl_stream(site: HTTPServer):
     print(f"Crawled {idx} pages")
    print(f"Duration: {time.perf_counter() - start_time:.2f} seconds")
 
+    site.check_assertions()
+
 if __name__ == "__main__":
     import subprocess
 