1919from .deep_crawling import DeepCrawlStrategy
2020
2121from .cache_context import CacheMode
22- from .proxy_strategy import ProxyRotationStrategy
22+ from .proxy_strategy import ProxyRotationStrategy , ProxyConfig
2323
2424from typing import Union , List
2525import inspect
2626from typing import Any , Dict , Optional
2727from enum import Enum
28+ from pathlib import Path
2829
30+ Serialisable = Optional [Union [str , int , float , bool , List , Dict ]]
2931
30- def to_serializable_dict (obj : Any , ignore_default_value : bool = False ) -> Dict :
32+
33+ def to_serializable_dict (obj : Any , ignore_default_value : bool = False ) -> Serialisable :
3134 """
3235 Recursively convert an object to a serializable dictionary using {type, params} structure
3336 for complex objects.
@@ -101,7 +104,7 @@ def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
101104 return str (obj )
102105
103106
104- def from_serializable_dict (data : Any ) -> Any :
107+ def from_serializable (data : Serialisable ) -> Any :
105108 """
106109 Recursively convert a serializable dictionary back to an object instance.
107110 """
@@ -116,7 +119,7 @@ def from_serializable_dict(data: Any) -> Any:
116119 if isinstance (data , dict ) and "type" in data :
117120 # Handle plain dictionaries
118121 if data ["type" ] == "dict" :
119- return {k : from_serializable_dict (v ) for k , v in data ["value" ].items ()}
122+ return {k : from_serializable (v ) for k , v in data ["value" ].items ()}
120123
121124 # Import from crawl4ai for class instances
122125 import crawl4ai
@@ -128,18 +131,16 @@ def from_serializable_dict(data: Any) -> Any:
128131 return cls (data ["params" ])
129132
130133 # Handle class instances
131- constructor_args = {
132- k : from_serializable_dict (v ) for k , v in data ["params" ].items ()
133- }
134+ constructor_args = {k : from_serializable (v ) for k , v in data ["params" ].items ()}
134135 return cls (** constructor_args )
135136
136137 # Handle lists
137138 if isinstance (data , list ):
138- return [from_serializable_dict (item ) for item in data ]
139+ return [from_serializable (item ) for item in data ]
139140
140141 # Handle raw dictionaries (legacy support)
141142 if isinstance (data , dict ):
142- return {k : from_serializable_dict (v ) for k , v in data .items ()}
143+ return {k : from_serializable (v ) for k , v in data .items ()}
143144
144145 return data
145146
@@ -190,7 +191,7 @@ class BrowserConfig:
190191 Default: True.
191192 accept_downloads (bool): Whether to allow file downloads. If True, requires a downloads_path.
192193 Default: False.
193- downloads_path (str or None): Directory to store downloaded files. If None and accept_downloads is True,
194+ downloads_path (Path or str or None): Directory to store downloaded files. If None and accept_downloads is True,
194195 a default path will be created. Default: None.
195196 storage_state (str or dict or None): Path or object describing storage state (cookies, localStorage).
196197 Default: None.
@@ -219,25 +220,25 @@ def __init__(
219220 browser_type : str = "chromium" ,
220221 headless : bool = True ,
221222 use_managed_browser : bool = False ,
222- cdp_url : str = None ,
223+ cdp_url : Optional [ str ] = None ,
223224 use_persistent_context : bool = False ,
224- user_data_dir : str = None ,
225+ user_data_dir : Optional [ str ] = None ,
225226 chrome_channel : str = "chromium" ,
226227 channel : str = "chromium" ,
227- proxy : str = None ,
228- proxy_config : dict = None ,
228+ proxy : Optional [ str ] = None ,
229+ proxy_config : Optional [ dict ] = None ,
229230 viewport_width : int = 1080 ,
230231 viewport_height : int = 600 ,
231- viewport : dict = None ,
232+ viewport : Optional [ dict ] = None ,
232233 accept_downloads : bool = False ,
233- downloads_path : str = None ,
234+ downloads_path : Optional [ Union [ Path , str ]] = None ,
234235 storage_state : Union [str , dict , None ] = None ,
235236 ignore_https_errors : bool = True ,
236237 java_script_enabled : bool = True ,
237238 sleep_on_close : bool = False ,
238239 verbose : bool = True ,
239- cookies : list = None ,
240- headers : dict = None ,
240+ cookies : Optional [ list ] = None ,
241+ headers : Optional [ dict ] = None ,
241242 user_agent : str = (
242243 # "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 "
243244 # "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
@@ -248,7 +249,7 @@ def __init__(
248249 user_agent_generator_config : dict = {},
249250 text_mode : bool = False ,
250251 light_mode : bool = False ,
251- extra_args : list = None ,
252+ extra_args : Optional [ list ] = None ,
252253 debugging_port : int = 9222 ,
253254 host : str = "localhost" ,
254255 ):
@@ -330,8 +331,8 @@ def from_kwargs(kwargs: dict) -> "BrowserConfig":
330331 "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
331332 "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36" ,
332333 ),
333- user_agent_mode = kwargs .get ("user_agent_mode" ),
334- user_agent_generator_config = kwargs .get ("user_agent_generator_config" ),
334+ user_agent_mode = kwargs .get ("user_agent_mode" , "" ),
335+ user_agent_generator_config = kwargs .get ("user_agent_generator_config" , "" ),
335336 text_mode = kwargs .get ("text_mode" , False ),
336337 light_mode = kwargs .get ("light_mode" , False ),
337338 extra_args = kwargs .get ("extra_args" , []),
@@ -382,17 +383,21 @@ def clone(self, **kwargs):
382383 config_dict .update (kwargs )
383384 return BrowserConfig .from_kwargs (config_dict )
384385
def dump(self) -> Serialisable:
    """Return this browser configuration in serializable form.

    The conversion is delegated to ``to_serializable_dict``, which is
    handed the instance itself.
    """
    serialized = to_serializable_dict(self)
    return serialized
389390
@staticmethod
def load(data: Serialisable) -> "BrowserConfig":
    """Rebuild a ``BrowserConfig`` from its serialized form.

    Args:
        data: Serialized payload. ``None`` yields a default
            ``BrowserConfig()``.

    Returns:
        The object produced by ``from_serializable`` when it is already a
        ``BrowserConfig``; otherwise that result is passed to
        ``BrowserConfig.from_kwargs``.
    """
    if data is None:
        return BrowserConfig()

    restored = from_serializable(data)
    already_built = isinstance(restored, BrowserConfig)
    # Anything that is not a ready instance is treated as a kwargs mapping.
    return restored if already_built else BrowserConfig.from_kwargs(restored)
397402
398403
@@ -456,12 +461,12 @@ def clone(self, **kwargs):
456461 config_dict .update (kwargs )
457462 return HTTPCrawlerConfig .from_kwargs (config_dict )
458463
def dump(self) -> Serialisable:
    """Return this HTTP crawler configuration in serializable form.

    The conversion is delegated to ``to_serializable_dict``, which is
    handed the instance itself.
    """
    serialized = to_serializable_dict(self)
    return serialized
461466
@staticmethod
def load(data: Serialisable) -> "HTTPCrawlerConfig":
    """Rebuild an ``HTTPCrawlerConfig`` from its serialized form.

    Mirrors ``BrowserConfig.load`` and ``CrawlerRunConfig.load``: the
    parameter is typed as ``Serialisable`` and a ``None`` payload yields a
    default configuration instead of being forwarded to ``from_kwargs``.

    Args:
        data: Serialized payload, or ``None`` for a default configuration.

    Returns:
        The object produced by ``from_serializable`` when it is already an
        ``HTTPCrawlerConfig``; otherwise that result is passed to
        ``HTTPCrawlerConfig.from_kwargs``.
    """
    if data is None:
        # NOTE(review): assumes HTTPCrawlerConfig() can be constructed with
        # no arguments, as the sibling config classes are -- confirm.
        return HTTPCrawlerConfig()

    config = from_serializable(data)
    if isinstance(config, HTTPCrawlerConfig):
        return config

    # Anything that is not a ready instance is treated as a kwargs mapping.
    return HTTPCrawlerConfig.from_kwargs(config)
@@ -636,49 +641,49 @@ class CrawlerRunConfig():
636641 user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set.
637642 Default: None.
638643
639- url: str = None # This is not a compulsory parameter
644+ url ( str or None): This is not a compulsory parameter
640645 """
641646
642647 def __init__ (
643648 self ,
644649 # Content Processing Parameters
645650 word_count_threshold : int = MIN_WORD_THRESHOLD ,
646- extraction_strategy : ExtractionStrategy = None ,
651+ extraction_strategy : Optional [ ExtractionStrategy ] = None ,
647652 chunking_strategy : ChunkingStrategy = RegexChunking (),
648- markdown_generator : MarkdownGenerationStrategy = None ,
653+ markdown_generator : Optional [ MarkdownGenerationStrategy ] = None ,
649654 only_text : bool = False ,
650- css_selector : str = None ,
651- excluded_tags : list = None ,
652- excluded_selector : str = None ,
655+ css_selector : Optional [ str ] = None ,
656+ excluded_tags : Optional [ list ] = None ,
657+ excluded_selector : Optional [ str ] = None ,
653658 keep_data_attributes : bool = False ,
654- keep_attrs : list = None ,
659+ keep_attrs : Optional [ list ] = None ,
655660 remove_forms : bool = False ,
656661 prettiify : bool = False ,
657662 parser_type : str = "lxml" ,
658- scraping_strategy : ContentScrapingStrategy = None ,
659- proxy_config : dict = None ,
663+ scraping_strategy : Optional [ ContentScrapingStrategy ] = None ,
664+ proxy_config : Optional [ ProxyConfig ] = None ,
660665 proxy_rotation_strategy : Optional [ProxyRotationStrategy ] = None ,
661666 # SSL Parameters
662667 fetch_ssl_certificate : bool = False ,
663668 # Caching Parameters
664669 cache_mode : CacheMode = CacheMode .BYPASS ,
665- session_id : str = None ,
670+ session_id : Optional [ str ] = None ,
666671 bypass_cache : bool = False ,
667672 disable_cache : bool = False ,
668673 no_cache_read : bool = False ,
669674 no_cache_write : bool = False ,
670- shared_data : dict = None ,
675+ shared_data : Optional [ dict ] = None ,
671676 # Page Navigation and Timing Parameters
672677 wait_until : str = "domcontentloaded" ,
673678 page_timeout : int = PAGE_TIMEOUT ,
674- wait_for : str = None ,
679+ wait_for : Optional [ str ] = None ,
675680 wait_for_images : bool = False ,
676681 delay_before_return_html : float = 0.1 ,
677682 mean_delay : float = 0.1 ,
678683 max_range : float = 0.3 ,
679684 semaphore_count : int = 5 ,
680685 # Page Interaction Parameters
681- js_code : Union [str , List [str ]] = None ,
686+ js_code : Optional [ Union [str , List [str ] ]] = None ,
682687 js_only : bool = False ,
683688 ignore_body_visibility : bool = True ,
684689 scan_full_page : bool = False ,
@@ -691,28 +696,28 @@ def __init__(
691696 adjust_viewport_to_content : bool = False ,
692697 # Media Handling Parameters
693698 screenshot : bool = False ,
694- screenshot_wait_for : float = None ,
699+ screenshot_wait_for : Optional [ float ] = None ,
695700 screenshot_height_threshold : int = SCREENSHOT_HEIGHT_TRESHOLD ,
696701 pdf : bool = False ,
697702 image_description_min_word_threshold : int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD ,
698703 image_score_threshold : int = IMAGE_SCORE_THRESHOLD ,
699704 exclude_external_images : bool = False ,
700705 # Link and Domain Handling Parameters
701- exclude_social_media_domains : list = None ,
706+ exclude_social_media_domains : Optional [ list ] = None ,
702707 exclude_external_links : bool = False ,
703708 exclude_social_media_links : bool = False ,
704- exclude_domains : list = None ,
709+ exclude_domains : Optional [ list ] = None ,
705710 exclude_internal_links : bool = False ,
706711 # Debugging and Logging Parameters
707712 verbose : bool = True ,
708713 log_console : bool = False ,
709714 # Connection Parameters
710715 method : str = "GET" ,
711716 stream : bool = False ,
712- url : str = None ,
717+ url : Optional [ str ] = None ,
713718 check_robots_txt : bool = False ,
714- user_agent : str = None ,
715- user_agent_mode : str = None ,
719+ user_agent : Optional [ str ] = None ,
720+ user_agent_mode : Optional [ str ] = None ,
716721 user_agent_generator_config : dict = {},
717722 # Deep Crawl Parameters
718723 deep_crawl_strategy : Optional [DeepCrawlStrategy ] = None ,
@@ -935,15 +940,18 @@ def from_kwargs(kwargs: dict) -> "CrawlerRunConfig":
935940 url = kwargs .get ("url" ),
936941 )
937942
def dump(self) -> Serialisable:
    """Return this crawler run configuration in serializable form.

    The conversion is delegated to ``to_serializable_dict``, which is
    handed the instance itself.
    """
    serialized = to_serializable_dict(self)
    return serialized
942947
@staticmethod
def load(data: Serialisable) -> "CrawlerRunConfig":
    """Rebuild a ``CrawlerRunConfig`` from its serialized form.

    Args:
        data: Serialized payload. ``None`` yields a default
            ``CrawlerRunConfig()``.

    Returns:
        The object produced by ``from_serializable`` when it is already a
        ``CrawlerRunConfig``; otherwise that result is passed to
        ``CrawlerRunConfig.from_kwargs``.
    """
    if data is None:
        return CrawlerRunConfig()

    restored = from_serializable(data)
    already_built = isinstance(restored, CrawlerRunConfig)
    # Anything that is not a ready instance is treated as a kwargs mapping.
    return restored if already_built else CrawlerRunConfig.from_kwargs(restored)
@@ -1051,7 +1059,7 @@ def __init__(
10511059 api_token : Optional [str ] = None ,
10521060 base_url : Optional [str ] = None ,
10531061 ):
1054- """Configuaration class for LLM provider and API token."""
1062+ """Configuration class for LLM provider and API token."""
10551063 self .provider = provider
10561064 if api_token and not api_token .startswith ("env:" ):
10571065 self .api_token = api_token
0 commit comments