@@ -9,19 +9,21 @@
 from collections import deque
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import Callable, Optional
+from typing import Optional
 from urllib.parse import urljoin, urlparse, urlunparse
 
 import aiohttp
 from bs4 import BeautifulSoup
 from markdownify import markdownify
 from tqdm.asyncio import tqdm
 
+from cairo_coder_tools.ingestion.web_targets import IWebsiteTarget
+
 # Configuration
-UA = "NotebookLM-prep-crawler/1.1 (+contact: [email protected])"
+UA = "starknet-prep-crawler"
 OUT_FILE = Path("doc_dump")
-CONCURRENCY = 4  # Low concurrency to avoid rate limits
-MAX_RETRIES = 5
+CONCURRENCY = 4
+MAX_RETRIES = 6
 TIMEOUT = 30
 MAX_CRAWL_PAGES = 100
 
@@ -49,35 +51,24 @@
 
 
 class DocsCrawler:
-    """Web crawler for documentation sites with filtering capabilities."""
-
-    def __init__(
-        self,
-        base_url: str,
-        include_patterns: Optional[list[str]] = None,
-        exclude_url_patterns: Optional[list[str]] = None,
-        content_filter: Optional[Callable[[str], bool]] = None,
-        content_processor: Optional[Callable[[str], str]] = None,
-    ):
+    """Web crawler for documentation sites with filtering capabilities.
+
+    Uses an IWebsiteTarget object to define crawling behavior.
+    """
+
+    def __init__(self, target: "IWebsiteTarget"):
         """Initialize the crawler.
 
         Args:
-            base_url: Base URL to start crawling from
-            include_patterns: Optional list of regex patterns for URLs to include
-            exclude_url_patterns: Optional list of regex patterns for URLs to exclude
-            content_filter: Optional function that returns True if content should be kept
-            content_processor: Optional function to post-process content (e.g., remove sections)
+            target: IWebsiteTarget object defining crawling configuration and behavior
         """
-        self.base_url = base_url.rstrip('/') + '/'
+        self.target = target
+        self.base_url = target.base_url.rstrip('/') + '/'
         self.domain = urlparse(self.base_url).netloc
         self.discovered_urls: list[str] = []
        self.fetched_pages: dict[str, dict] = {}
         self.session: Optional[aiohttp.ClientSession] = None
         self.semaphore = asyncio.Semaphore(CONCURRENCY)
-        self.include_patterns = include_patterns or []
-        self.exclude_url_patterns = exclude_url_patterns or []
-        self.content_filter = content_filter
-        self.content_processor = content_processor
 
     async def __aenter__(self):
         timeout = aiohttp.ClientTimeout(total=TIMEOUT)
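
The IWebsiteTarget interface itself lives in cairo_coder_tools.ingestion.web_targets and is not part of this diff. A minimal sketch of a concrete target, assuming only the attribute and methods the crawler invokes elsewhere in this diff (base_url, get_include_url_patterns, get_exclude_url_patterns, filter_content, process_content), might look like:

    # Hypothetical target; the real interface in web_targets may differ.
    class ExampleDocsTarget:
        base_url = "https://docs.example.com/"

        def get_include_url_patterns(self) -> list[str]:
            # Regexes matched against the URL path; an empty list keeps everything.
            return [r"/guides/"]

        def get_exclude_url_patterns(self) -> list[str]:
            return [r"/changelog", r"\.pdf$"]

        def filter_content(self, markdown: str) -> bool:
            # Return True to keep the page, False to drop it.
            return "deprecated" not in markdown.lower()

        def process_content(self, markdown: str) -> str:
            # Post-process the page, e.g., strip unwanted sections.
            return markdown.strip()
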
@@ -115,7 +106,7 @@ def is_valid_url(self, url: str) -> bool:
         return not any(re.search(pattern, path, re.IGNORECASE) for pattern in EXCLUDE_PATTERNS)
 
     def filter_urls(self, urls: list[str]) -> list[str]:
-        """Filter URLs based on include/exclude patterns.
+        """Filter URLs based on include/exclude patterns from the target.
 
         This is applied AFTER discovery and BEFORE fetching.
 
@@ -126,20 +117,18 @@ def filter_urls(self, urls: list[str]) -> list[str]:
             Filtered list of URLs
         """
         filtered_urls = []
+        include_patterns = self.target.get_include_url_patterns()
+        exclude_patterns = self.target.get_exclude_url_patterns()
 
         for url in urls:
             parsed = urlparse(url)
             path = parsed.path
 
-            # Check include patterns if provided
-            if self.include_patterns:
-                if not any(re.search(pattern, path, re.IGNORECASE) for pattern in self.include_patterns):
-                    continue
+            if include_patterns and not any(re.search(pattern, path, re.IGNORECASE) for pattern in include_patterns):
+                continue
 
-            # Check custom exclude patterns (user-provided)
-            if self.exclude_url_patterns:
-                if any(re.search(pattern, path, re.IGNORECASE) for pattern in self.exclude_url_patterns):
-                    continue
+            if exclude_patterns and any(re.search(pattern, path, re.IGNORECASE) for pattern in exclude_patterns):
+                continue
 
             filtered_urls.append(url)
 
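
Note the semantics this refactor preserves: patterns are regexes applied with re.search against the URL path only (not the domain or query string), case-insensitively, and an empty include list admits every URL. A small standalone illustration, with made-up patterns and URLs:

    import re
    from urllib.parse import urlparse

    include_patterns = [r"/docs/"]
    exclude_patterns = [r"/docs/changelog"]

    for url in [
        "https://example.com/docs/intro",      # kept
        "https://example.com/docs/changelog",  # matches exclude, dropped
        "https://example.com/blog/post",       # fails include, dropped
    ]:
        path = urlparse(url).path
        keep = ((not include_patterns
                 or any(re.search(p, path, re.IGNORECASE) for p in include_patterns))
                and not any(re.search(p, path, re.IGNORECASE) for p in exclude_patterns))
        print(url, "->", keep)
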
@@ -301,6 +290,7 @@ async def fetch_page(self, url: str) -> dict:
                         logger.debug(f"Got {response.status} for {url}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
                         await asyncio.sleep(wait_time)
                         continue
+                    logger.debug(f"Failed to fetch {url} after {MAX_RETRIES} attempts: {last_error}")
 
                 # For other non-200 statuses, return immediately (no retry)
                 return {
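
The surrounding retry loop and the wait_time computation are outside this hunk. One shape consistent with the visible lines, assuming exponential backoff and a set of retryable statuses (both assumptions, not shown in the diff), would be:

    # Sketch of a fetch_page body consistent with the hunk above; the backoff
    # formula and the retryable status codes are assumptions.
    last_error = None
    for attempt in range(MAX_RETRIES):
        async with self.session.get(url) as response:
            if response.status == 200:
                return {"url": url, "status": 200, "content": await response.text()}
            last_error = f"HTTP {response.status}"
            if response.status in (429, 502, 503):  # assumed retryable
                wait_time = 2 ** attempt  # assumed backoff: 1, 2, 4, 8, 16, 32s
                logger.debug(f"Got {response.status} for {url}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
                await asyncio.sleep(wait_time)
                continue
            # For other non-200 statuses, return immediately (no retry)
            return {"url": url, "status": response.status, "error": last_error}
    logger.debug(f"Failed to fetch {url} after {MAX_RETRIES} attempts: {last_error}")
    return {"url": url, "status": None, "error": last_error}
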
@@ -429,14 +419,13 @@ def compile_markdown(self) -> str:
             if page_data.get('content'):
                 title, markdown = self.extract_content(page_data['content'], url)
 
-                # Apply content filter if provided
-                if self.content_filter and not self.content_filter(markdown):
+                # Apply content filter from target
+                if not self.target.filter_content(markdown):
                     filtered_out += 1
                     continue
 
-                # Apply content processor if provided (e.g., remove unwanted sections)
-                if self.content_processor:
-                    markdown = self.content_processor(markdown)
+                # Apply content processor from target (e.g., remove unwanted sections)
+                markdown = self.target.process_content(markdown)
 
                 if not markdown or len(markdown.strip()) < 50:
                     markdown = "*No content extracted.*"
@@ -458,7 +447,7 @@ def compile_markdown(self) -> str:
             error = page_data.get('error', 'Unknown error')
             logger.info(f"Skipping {url}: {error}")
 
-        logger.info(f"Filtered out {filtered_out} pages based on content filter: {self.content_filter}")
+        logger.info(f"Filtered out {filtered_out} pages based on content filter")
         return '\n'.join(lines)
 
     async def run(self, output_path: Optional[Path] = None) -> Path:
@@ -510,10 +499,7 @@ async def run(self, output_path: Optional[Path] = None) -> Path:
         markdown_content = self.compile_markdown()
 
         # Save markdown
-        if output_path is None:
-            output_path = OUT_FILE.with_suffix('.md')
-        else:
-            output_path = Path(output_path)
+        output_path = OUT_FILE.with_suffix('.md') if output_path is None else Path(output_path)
 
         logger.info(f"Saving markdown to: {output_path}")
         output_path.write_text(markdown_content, encoding='utf-8')
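
Taken together, the refactor moves all per-site configuration behind the target object, so a caller now looks roughly like this (ExampleDocsTarget is the hypothetical target sketched earlier; the async context manager and the async run method returning a Path are shown in this diff):

    import asyncio
    from pathlib import Path

    async def main() -> None:
        async with DocsCrawler(ExampleDocsTarget()) as crawler:
            out_path = await crawler.run(Path("example_docs.md"))
            print(f"Wrote {out_path}")

    asyncio.run(main())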