
Commit 886d2cc

better org of web target crawl

1 parent 7794ccf

9 files changed: +338 additions, -417 deletions

.trunk/trunk.yaml

Lines changed: 4 additions & 0 deletions

@@ -16,6 +16,10 @@ runtimes:
 
 # This is the section where you manage your linters. (https://docs.trunk.io/check/configuration)
 lint:
+  ignore:
+    - linters: [ALL]
+      paths:
+        - python/src/cairo_coder_tools/ingestion/generated
   enabled:

ingesters/src/ingesters/StarknetBlogIngester.ts

Lines changed: 1 addition & 1 deletion

@@ -50,7 +50,7 @@ export class StarknetBlogIngester extends MarkdownIngester {
       'cairo_coder_tools',
       'ingestion',
       'generated',
-      'blog_summary.md',
+      'starknet-blog.md',
     );
 
     logger.info(`Reading Starknet blog summary from ${summaryPath}`);

python/src/cairo_coder_tools/datasets/analysis.py

Lines changed: 0 additions & 1 deletion

@@ -5,7 +5,6 @@
 
 import json
 from pathlib import Path
-from typing import Any
 
 import dspy
 from dspy.adapters.baml_adapter import BAMLAdapter

python/src/cairo_coder_tools/ingestion/crawler.py

Lines changed: 29 additions & 43 deletions

@@ -9,19 +9,21 @@
 from collections import deque
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import Callable, Optional
+from typing import Optional
 from urllib.parse import urljoin, urlparse, urlunparse
 
 import aiohttp
 from bs4 import BeautifulSoup
 from markdownify import markdownify
 from tqdm.asyncio import tqdm
 
+from cairo_coder_tools.ingestion.web_targets import IWebsiteTarget
+
 # Configuration
-UA = "NotebookLM-prep-crawler/1.1 (+contact: [email protected])"
+UA = "starknet-prep-crawler"
 OUT_FILE = Path("doc_dump")
-CONCURRENCY = 4  # Low concurrency to avoid rate limits
-MAX_RETRIES = 5
+CONCURRENCY = 4
+MAX_RETRIES = 6
 TIMEOUT = 30
 MAX_CRAWL_PAGES = 100

@@ -49,35 +51,24 @@
 
 
 class DocsCrawler:
-    """Web crawler for documentation sites with filtering capabilities."""
-
-    def __init__(
-        self,
-        base_url: str,
-        include_patterns: Optional[list[str]] = None,
-        exclude_url_patterns: Optional[list[str]] = None,
-        content_filter: Optional[Callable[[str], bool]] = None,
-        content_processor: Optional[Callable[[str], str]] = None,
-    ):
+    """Web crawler for documentation sites with filtering capabilities.
+
+    Uses an IWebsiteTarget object to define crawling behavior.
+    """
+
+    def __init__(self, target: "IWebsiteTarget"):
         """Initialize the crawler.
 
         Args:
-            base_url: Base URL to start crawling from
-            include_patterns: Optional list of regex patterns for URLs to include
-            exclude_url_patterns: Optional list of regex patterns for URLs to exclude
-            content_filter: Optional function that returns True if content should be kept
-            content_processor: Optional function to post-process content (e.g., remove sections)
+            target: IWebsiteTarget object defining crawling configuration and behavior
         """
-        self.base_url = base_url.rstrip('/') + '/'
+        self.target = target
+        self.base_url = target.base_url.rstrip('/') + '/'
         self.domain = urlparse(self.base_url).netloc
         self.discovered_urls: list[str] = []
         self.fetched_pages: dict[str, dict] = {}
         self.session: Optional[aiohttp.ClientSession] = None
         self.semaphore = asyncio.Semaphore(CONCURRENCY)
-        self.include_patterns = include_patterns or []
-        self.exclude_url_patterns = exclude_url_patterns or []
-        self.content_filter = content_filter
-        self.content_processor = content_processor
 
     async def __aenter__(self):
         timeout = aiohttp.ClientTimeout(total=TIMEOUT)
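
With this change, the crawler's behavior comes entirely from the target object. The interface is defined in web_targets.py, which is part of this commit but not visible in this view; a minimal sketch of its likely shape, inferred only from the attribute and methods the crawler calls in this diff, might look like the following (anything beyond those five members is an assumption).

# Hypothetical sketch only: the real IWebsiteTarget lives in
# cairo_coder_tools/ingestion/web_targets.py (changed in this commit but not
# shown here). The shape is inferred from what the crawler uses: base_url,
# get_include_url_patterns, get_exclude_url_patterns, filter_content,
# process_content.
from abc import ABC, abstractmethod


class IWebsiteTarget(ABC):
    """One documentation site to crawl, plus its filtering rules."""

    # Root URL the crawler starts from (read as target.base_url).
    base_url: str

    @abstractmethod
    def get_include_url_patterns(self) -> list[str]:
        """Regex patterns a URL path must match to be fetched (empty list = fetch all)."""

    @abstractmethod
    def get_exclude_url_patterns(self) -> list[str]:
        """Regex patterns that drop a URL path even if it matched an include pattern."""

    @abstractmethod
    def filter_content(self, markdown: str) -> bool:
        """Return True to keep a page's extracted markdown in the compiled dump."""

    @abstractmethod
    def process_content(self, markdown: str) -> str:
        """Post-process kept markdown, e.g. strip unwanted sections."""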
@@ -115,7 +106,7 @@ def is_valid_url(self, url: str) -> bool:
         return not any(re.search(pattern, path, re.IGNORECASE) for pattern in EXCLUDE_PATTERNS)
 
     def filter_urls(self, urls: list[str]) -> list[str]:
-        """Filter URLs based on include/exclude patterns.
+        """Filter URLs based on include/exclude patterns from the target.
 
         This is applied AFTER discovery and BEFORE fetching.
 
@@ -126,20 +117,18 @@ def filter_urls(self, urls: list[str]) -> list[str]:
             Filtered list of URLs
         """
         filtered_urls = []
+        include_patterns = self.target.get_include_url_patterns()
+        exclude_patterns = self.target.get_exclude_url_patterns()
 
         for url in urls:
             parsed = urlparse(url)
             path = parsed.path
 
-            # Check include patterns if provided
-            if self.include_patterns:
-                if not any(re.search(pattern, path, re.IGNORECASE) for pattern in self.include_patterns):
-                    continue
+            if include_patterns and not any(re.search(pattern, path, re.IGNORECASE) for pattern in include_patterns):
+                continue
 
-            # Check custom exclude patterns (user-provided)
-            if self.exclude_url_patterns:
-                if any(re.search(pattern, path, re.IGNORECASE) for pattern in self.exclude_url_patterns):
-                    continue
+            if exclude_patterns and any(re.search(pattern, path, re.IGNORECASE) for pattern in exclude_patterns):
+                continue
 
             filtered_urls.append(url)
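
For intuition, the include/exclude check above behaves like this small standalone example (the patterns are hypothetical, not taken from any real target).

# Hypothetical patterns; matching mirrors filter_urls above
# (re.search against the URL path, case-insensitive).
import re

include_patterns = [r"^/docs/"]
exclude_patterns = [r"/changelog"]

for path in ["/docs/intro", "/docs/changelog", "/blog/post"]:
    include_ok = not include_patterns or any(
        re.search(p, path, re.IGNORECASE) for p in include_patterns
    )
    exclude_hit = any(re.search(p, path, re.IGNORECASE) for p in exclude_patterns)
    print(path, "kept" if include_ok and not exclude_hit else "dropped")
# Prints: /docs/intro kept, /docs/changelog dropped, /blog/post dropped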

@@ -301,6 +290,7 @@ async def fetch_page(self, url: str) -> dict:
                     logger.debug(f"Got {response.status} for {url}, retrying in {wait_time}s (attempt {attempt + 1}/{MAX_RETRIES})")
                     await asyncio.sleep(wait_time)
                     continue
+                logger.debug(f"Failed to fetch {url} after {MAX_RETRIES} attempts: {last_error}")
 
                 # For other non-200 statuses, return immediately (no retry)
                 return {
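
The new debug line records giving up once the retry budget is exhausted. The full loop is not visible in this hunk, so the sketch below only illustrates the implied shape: retry transient statuses with a growing wait, give up after MAX_RETRIES, and return other errors immediately. The wait_time formula and the set of retryable statuses are assumptions.

# Hypothetical sketch of a retry-with-backoff fetch matching the log messages
# above; the wait_time formula and RETRYABLE set are assumptions, not taken
# from the real fetch_page.
import asyncio

import aiohttp

MAX_RETRIES = 6
RETRYABLE = {429, 500, 502, 503}


async def fetch_with_retries(session: aiohttp.ClientSession, url: str) -> dict:
    last_error = None
    for attempt in range(MAX_RETRIES):
        async with session.get(url) as response:
            if response.status == 200:
                return {"url": url, "content": await response.text(), "error": None}
            if response.status in RETRYABLE:
                last_error = f"HTTP {response.status}"
                wait_time = 2 ** attempt  # assumed backoff formula
                await asyncio.sleep(wait_time)
                continue
            # Other non-200 statuses: return immediately, no retry.
            return {"url": url, "content": None, "error": f"HTTP {response.status}"}
    return {"url": url, "content": None, "error": last_error}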
@@ -429,14 +419,13 @@ def compile_markdown(self) -> str:
             if page_data.get('content'):
                 title, markdown = self.extract_content(page_data['content'], url)
 
-                # Apply content filter if provided
-                if self.content_filter and not self.content_filter(markdown):
+                # Apply content filter from target
+                if not self.target.filter_content(markdown):
                     filtered_out += 1
                     continue
 
-                # Apply content processor if provided (e.g., remove unwanted sections)
-                if self.content_processor:
-                    markdown = self.content_processor(markdown)
+                # Apply content processor from target (e.g., remove unwanted sections)
+                markdown = self.target.process_content(markdown)
 
                 if not markdown or len(markdown.strip()) < 50:
                     markdown = "*No content extracted.*"
@@ -458,7 +447,7 @@ def compile_markdown(self) -> str:
                 error = page_data.get('error', 'Unknown error')
                 logger.info(f"Skipping {url}: {error}")
 
-        logger.info(f"Filtered out {filtered_out} pages based on content filter: {self.content_filter}")
+        logger.info(f"Filtered out {filtered_out} pages based on content filter")
         return '\n'.join(lines)
 
     async def run(self, output_path: Optional[Path] = None) -> Path:
@@ -510,10 +499,7 @@ async def run(self, output_path: Optional[Path] = None) -> Path:
         markdown_content = self.compile_markdown()
 
         # Save markdown
-        if output_path is None:
-            output_path = OUT_FILE.with_suffix('.md')
-        else:
-            output_path = Path(output_path)
+        output_path = OUT_FILE.with_suffix('.md') if output_path is None else Path(output_path)
 
         logger.info(f"Saving markdown to: {output_path}")
         output_path.write_text(markdown_content, encoding='utf-8')
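
End to end, the refactored crawler would be driven roughly like this (DocsCrawler's async-context usage and the run(output_path=...) signature come from this diff; ExampleDocsTarget is the illustrative class sketched above).

# Hypothetical usage; DocsCrawler, its async context manager, and run() are
# from this diff, ExampleDocsTarget is the illustrative class sketched earlier.
import asyncio
from pathlib import Path

from cairo_coder_tools.ingestion.crawler import DocsCrawler


async def main() -> None:
    async with DocsCrawler(ExampleDocsTarget()) as crawler:
        output = await crawler.run(output_path=Path("example-docs.md"))
    print(f"Compiled markdown written to {output}")


if __name__ == "__main__":
    asyncio.run(main())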
