
Commit 48c34ff

fix: tests and associated bugs
This is a placeholder commit containing all the fixes and improvements needed to get the existing tests running error-free. A small number of tests have been marked as skipped, either because they are flaky or because significant functionality would be required to make them pass. The intention is to split this work into smaller, more consumable changes; it should not be merged as is.
1 parent 6eed4ad · commit 48c34ff

File tree

108 files changed: +5595 −4169 lines


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -50,6 +50,7 @@ coverage.xml
 .hypothesis/
 .pytest_cache/
 cover/
+tests/async/output/
 
 # Translations
 *.mo

crawl4ai/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -1,5 +1,6 @@
 # __init__.py
 import warnings
+from logging import Logger
 
 from .async_webcrawler import AsyncWebCrawler, CacheMode
 from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig
@@ -64,6 +65,7 @@
     DFSDeepCrawlStrategy,
     DeepCrawlDecorator,
 )
+from .deep_crawling.scorers import ScoringStats
 
 __all__ = [
     "AsyncLoggerBase",
@@ -120,6 +122,8 @@
     "Crawl4aiDockerClient",
     "ProxyRotationStrategy",
     "RoundRobinProxyStrategy",
+    "ScoringStats",
+    "Logger",  # Required for serialization
 ]
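
Why export Logger from the package root? The "Required for serialization" comment, together with the deserializer in async_configs.py below (which does `import crawl4ai` and looks class instances up by name), suggests that any type appearing inside a dumped config must be reachable as an attribute of the crawl4ai package. A minimal sketch of that lookup, assuming the {type, params} resolution works roughly as the diff below implies; resolve_type is a hypothetical helper name:

import crawl4ai

def resolve_type(type_name: str) -> type:
    # Only names imported in crawl4ai/__init__.py become attributes of the
    # package, which is why Logger is imported above even though nothing in
    # this file calls it directly.
    return getattr(crawl4ai, type_name)

print(resolve_type("Logger"))  # <class 'logging.Logger'>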

crawl4ai/async_configs.py

Lines changed: 59 additions & 51 deletions
@@ -19,15 +19,18 @@
 from .deep_crawling import DeepCrawlStrategy
 
 from .cache_context import CacheMode
-from .proxy_strategy import ProxyRotationStrategy
+from .proxy_strategy import ProxyRotationStrategy, ProxyConfig
 
 from typing import Union, List
 import inspect
 from typing import Any, Dict, Optional
 from enum import Enum
+from pathlib import Path
 
+Serialisable = Optional[Union[str, int, float, bool, List, Dict]]
 
-def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
+
+def to_serializable_dict(obj: Any, ignore_default_value: bool = False) -> Serialisable:
     """
     Recursively convert an object to a serializable dictionary using {type, params} structure
     for complex objects.
@@ -101,7 +104,7 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
         return str(obj)
 
 
-def from_serializable_dict(data: Any) -> Any:
+def from_serializable(data: Serialisable) -> Any:
     """
     Recursively convert a serializable dictionary back to an object instance.
     """
@@ -116,7 +119,7 @@ def from_serializable_dict(data: Any) -> Any:
     if isinstance(data, dict) and "type" in data:
         # Handle plain dictionaries
         if data["type"] == "dict":
-            return {k: from_serializable_dict(v) for k, v in data["value"].items()}
+            return {k: from_serializable(v) for k, v in data["value"].items()}
 
     # Import from crawl4ai for class instances
     import crawl4ai
@@ -128,18 +131,16 @@ def from_serializable_dict(data: Any) -> Any:
             return cls(data["params"])
 
         # Handle class instances
-        constructor_args = {
-            k: from_serializable_dict(v) for k, v in data["params"].items()
-        }
+        constructor_args = {k: from_serializable(v) for k, v in data["params"].items()}
         return cls(**constructor_args)
 
     # Handle lists
     if isinstance(data, list):
-        return [from_serializable_dict(item) for item in data]
+        return [from_serializable(item) for item in data]
 
     # Handle raw dictionaries (legacy support)
     if isinstance(data, dict):
-        return {k: from_serializable_dict(v) for k, v in data.items()}
+        return {k: from_serializable(v) for k, v in data.items()}
 
     return data
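
The rename from from_serializable_dict to from_serializable matches the widened Serialisable alias: the input can now be a scalar, list, or dict rather than only a dict. A hedged round-trip sketch using the two module-level helpers from this hunk; the specific config values are illustrative:

from crawl4ai.async_configs import to_serializable_dict, from_serializable
from crawl4ai import BrowserConfig

# Complex objects serialize to {type, params} nodes; scalars pass through.
original = BrowserConfig(headless=False, viewport_width=1280)
data = to_serializable_dict(original)   # JSON-safe dicts/lists/scalars only
restored = from_serializable(data)      # rebuilds instances by type name

assert isinstance(restored, BrowserConfig)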

@@ -190,7 +191,7 @@ class BrowserConfig:
             Default: True.
         accept_downloads (bool): Whether to allow file downloads. If True, requires a downloads_path.
             Default: False.
-        downloads_path (str or None): Directory to store downloaded files. If None and accept_downloads is True,
+        downloads_path (Path or str or None): Directory to store downloaded files. If None and accept_downloads is True,
             a default path will be created. Default: None.
         storage_state (str or dict or None): Path or object describing storage state (cookies, localStorage).
             Default: None.
@@ -219,25 +220,25 @@ def __init__(
         browser_type: str = "chromium",
         headless: bool = True,
         use_managed_browser: bool = False,
-        cdp_url: str = None,
+        cdp_url: Optional[str] = None,
         use_persistent_context: bool = False,
-        user_data_dir: str = None,
+        user_data_dir: Optional[str] = None,
         chrome_channel: str = "chromium",
         channel: str = "chromium",
-        proxy: str = None,
-        proxy_config: dict = None,
+        proxy: Optional[str] = None,
+        proxy_config: Optional[dict] = None,
         viewport_width: int = 1080,
         viewport_height: int = 600,
-        viewport: dict = None,
+        viewport: Optional[dict] = None,
         accept_downloads: bool = False,
-        downloads_path: str = None,
+        downloads_path: Optional[Union[Path, str]] = None,
         storage_state: Union[str, dict, None] = None,
         ignore_https_errors: bool = True,
         java_script_enabled: bool = True,
         sleep_on_close: bool = False,
         verbose: bool = True,
-        cookies: list = None,
-        headers: dict = None,
+        cookies: Optional[list] = None,
+        headers: Optional[dict] = None,
         user_agent: str = (
             # "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 "
             # "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
@@ -248,7 +249,7 @@ def __init__(
         user_agent_generator_config: dict = {},
         text_mode: bool = False,
         light_mode: bool = False,
-        extra_args: list = None,
+        extra_args: Optional[list] = None,
         debugging_port: int = 9222,
         host: str = "localhost",
     ):
@@ -330,8 +331,8 @@ def from_kwargs(kwargs: dict) -> "BrowserConfig":
                 "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                 "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
             ),
-            user_agent_mode=kwargs.get("user_agent_mode"),
-            user_agent_generator_config=kwargs.get("user_agent_generator_config"),
+            user_agent_mode=kwargs.get("user_agent_mode", ""),
+            user_agent_generator_config=kwargs.get("user_agent_generator_config", ""),
            text_mode=kwargs.get("text_mode", False),
            light_mode=kwargs.get("light_mode", False),
            extra_args=kwargs.get("extra_args", []),
@@ -382,17 +383,21 @@ def clone(self, **kwargs):
         config_dict.update(kwargs)
         return BrowserConfig.from_kwargs(config_dict)
 
-    # Create a funciton returns dict of the object
-    def dump(self) -> dict:
+    # Create a function returns dict of the object
+    def dump(self) -> Serialisable:
         # Serialize the object to a dictionary
         return to_serializable_dict(self)
 
     @staticmethod
-    def load(data: dict) -> "BrowserConfig":
+    def load(data: Serialisable) -> "BrowserConfig":
+        if data is None:
+            return BrowserConfig()
+
         # Deserialize the object from a dictionary
-        config = from_serializable_dict(data)
+        config = from_serializable(data)
         if isinstance(config, BrowserConfig):
             return config
+
         return BrowserConfig.from_kwargs(config)
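
With the new None guard, BrowserConfig.load(None) returns a default config instead of failing inside the deserializer. A short usage sketch; dump() and load() are from the hunk above, while the JSON transport is just one plausible use of the Serialisable payload:

import json
from crawl4ai import BrowserConfig

# dump() produces JSON-safe data, so a config can cross a process boundary.
payload = json.dumps(BrowserConfig(headless=True).dump())
config = BrowserConfig.load(json.loads(payload))

# New in this commit: a missing payload yields defaults rather than an error.
assert isinstance(BrowserConfig.load(None), BrowserConfig)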

@@ -456,12 +461,12 @@ def clone(self, **kwargs):
         config_dict.update(kwargs)
         return HTTPCrawlerConfig.from_kwargs(config_dict)
 
-    def dump(self) -> dict:
+    def dump(self) -> Serialisable:
         return to_serializable_dict(self)
 
     @staticmethod
     def load(data: dict) -> "HTTPCrawlerConfig":
-        config = from_serializable_dict(data)
+        config = from_serializable(data)
         if isinstance(config, HTTPCrawlerConfig):
             return config
         return HTTPCrawlerConfig.from_kwargs(config)

@@ -636,49 +641,49 @@ class CrawlerRunConfig():
         user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set.
             Default: None.
 
-        url: str = None # This is not a compulsory parameter
+        url (str or None): This is not a compulsory parameter
     """
 
     def __init__(
         self,
         # Content Processing Parameters
         word_count_threshold: int = MIN_WORD_THRESHOLD,
-        extraction_strategy: ExtractionStrategy = None,
+        extraction_strategy: Optional[ExtractionStrategy] = None,
         chunking_strategy: ChunkingStrategy = RegexChunking(),
-        markdown_generator: MarkdownGenerationStrategy = None,
+        markdown_generator: Optional[MarkdownGenerationStrategy] = None,
         only_text: bool = False,
-        css_selector: str = None,
-        excluded_tags: list = None,
-        excluded_selector: str = None,
+        css_selector: Optional[str] = None,
+        excluded_tags: Optional[list] = None,
+        excluded_selector: Optional[str] = None,
         keep_data_attributes: bool = False,
-        keep_attrs: list = None,
+        keep_attrs: Optional[list] = None,
         remove_forms: bool = False,
         prettiify: bool = False,
         parser_type: str = "lxml",
-        scraping_strategy: ContentScrapingStrategy = None,
-        proxy_config: dict = None,
+        scraping_strategy: Optional[ContentScrapingStrategy] = None,
+        proxy_config: Optional[ProxyConfig] = None,
         proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
         # SSL Parameters
         fetch_ssl_certificate: bool = False,
         # Caching Parameters
         cache_mode: CacheMode = CacheMode.BYPASS,
-        session_id: str = None,
+        session_id: Optional[str] = None,
         bypass_cache: bool = False,
         disable_cache: bool = False,
         no_cache_read: bool = False,
         no_cache_write: bool = False,
-        shared_data: dict = None,
+        shared_data: Optional[dict] = None,
         # Page Navigation and Timing Parameters
         wait_until: str = "domcontentloaded",
         page_timeout: int = PAGE_TIMEOUT,
-        wait_for: str = None,
+        wait_for: Optional[str] = None,
         wait_for_images: bool = False,
         delay_before_return_html: float = 0.1,
         mean_delay: float = 0.1,
         max_range: float = 0.3,
         semaphore_count: int = 5,
         # Page Interaction Parameters
-        js_code: Union[str, List[str]] = None,
+        js_code: Optional[Union[str, List[str]]] = None,
         js_only: bool = False,
         ignore_body_visibility: bool = True,
         scan_full_page: bool = False,
@@ -691,28 +696,28 @@ def __init__(
         adjust_viewport_to_content: bool = False,
         # Media Handling Parameters
         screenshot: bool = False,
-        screenshot_wait_for: float = None,
+        screenshot_wait_for: Optional[float] = None,
         screenshot_height_threshold: int = SCREENSHOT_HEIGHT_TRESHOLD,
         pdf: bool = False,
         image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
         image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
         exclude_external_images: bool = False,
         # Link and Domain Handling Parameters
-        exclude_social_media_domains: list = None,
+        exclude_social_media_domains: Optional[list] = None,
         exclude_external_links: bool = False,
         exclude_social_media_links: bool = False,
-        exclude_domains: list = None,
+        exclude_domains: Optional[list] = None,
         exclude_internal_links: bool = False,
         # Debugging and Logging Parameters
         verbose: bool = True,
         log_console: bool = False,
         # Connection Parameters
         method: str = "GET",
         stream: bool = False,
-        url: str = None,
+        url: Optional[str] = None,
         check_robots_txt: bool = False,
-        user_agent: str = None,
-        user_agent_mode: str = None,
+        user_agent: Optional[str] = None,
+        user_agent_mode: Optional[str] = None,
         user_agent_generator_config: dict = {},
         # Deep Crawl Parameters
         deep_crawl_strategy: Optional[DeepCrawlStrategy] = None,
@@ -935,15 +940,18 @@ def from_kwargs(kwargs: dict) -> "CrawlerRunConfig":
             url=kwargs.get("url"),
         )
 
-    # Create a funciton returns dict of the object
-    def dump(self) -> dict:
+    # Create a function returns dict of the object
+    def dump(self) -> Serialisable:
         # Serialize the object to a dictionary
         return to_serializable_dict(self)
 
     @staticmethod
-    def load(data: dict) -> "CrawlerRunConfig":
+    def load(data: Serialisable) -> "CrawlerRunConfig":
+        if data is None:
+            return CrawlerRunConfig()
+
         # Deserialize the object from a dictionary
-        config = from_serializable_dict(data)
+        config = from_serializable(data)
         if isinstance(config, CrawlerRunConfig):
             return config
         return CrawlerRunConfig.from_kwargs(config)
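
CrawlerRunConfig gets the same treatment: Optional[...] annotations that match the None defaults, plus a None-tolerant load(). A hedged sketch of what this enables; the URL is illustrative, and clone() is assumed to mirror the clone(**kwargs) helper shown for the other config classes in this diff:

from crawl4ai import CrawlerRunConfig, CacheMode

# Explicit None arguments are now well-typed instead of fighting `str = None`.
run_config = CrawlerRunConfig(
    url="https://example.com",
    cache_mode=CacheMode.BYPASS,
    wait_for=None,
)

# clone() copies the config with overrides; load(None) falls back to defaults.
stream_config = run_config.clone(stream=True)
assert isinstance(CrawlerRunConfig.load(None), CrawlerRunConfig)
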
@@ -1051,7 +1059,7 @@ def __init__(
         api_token: Optional[str] = None,
         base_url: Optional[str] = None,
     ):
-        """Configuaration class for LLM provider and API token."""
+        """Configuration class for LLM provider and API token."""
         self.provider = provider
         if api_token and not api_token.startswith("env:"):
             self.api_token = api_token