Skip to content

Commit a335899

Browse files
committed
fix: tests and associated bugs
This is a placeholder commit that contains all the fixes and improvements needed to get the existing tests running error-free. A small number of tests have been marked as skipped, either because of flaky behaviour or because significant new functionality is needed to make them work. The intention is to split this commit into many smaller, more consumable changes.
1 parent 6eb42be commit a335899

File tree

103 files changed

+7049
-4894
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

103 files changed

+7049
-4894
lines changed

Diff for: .gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ coverage.xml
5050
.hypothesis/
5151
.pytest_cache/
5252
cover/
53+
tests/async/output/
5354

5455
# Translations
5556
*.mo

Diff for: crawl4ai/__init__.py

+49-44
Original file line numberDiff line numberDiff line change
@@ -1,68 +1,71 @@
11
# __init__.py
22
import warnings
3+
from logging import Logger
34

4-
from .async_webcrawler import AsyncWebCrawler, CacheMode
55
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig
6-
7-
from .content_scraping_strategy import (
8-
ContentScrapingStrategy,
9-
WebScrapingStrategy,
10-
LXMLWebScrapingStrategy,
6+
from .async_dispatcher import (
7+
BaseDispatcher,
8+
CrawlerMonitor,
9+
DisplayMode,
10+
MemoryAdaptiveDispatcher,
11+
RateLimiter,
12+
SemaphoreDispatcher,
1113
)
1214
from .async_logger import (
13-
AsyncLoggerBase,
1415
AsyncLogger,
16+
AsyncLoggerBase,
1517
)
16-
from .proxy_strategy import (
17-
ProxyRotationStrategy,
18-
RoundRobinProxyStrategy,
19-
)
20-
from .extraction_strategy import (
21-
ExtractionStrategy,
22-
LLMExtractionStrategy,
23-
CosineStrategy,
24-
JsonCssExtractionStrategy,
25-
JsonXPathExtractionStrategy,
26-
)
18+
from .async_webcrawler import AsyncWebCrawler, CacheMode
19+
from .browser_profiler import BrowserProfiler
2720
from .chunking_strategy import ChunkingStrategy, RegexChunking
28-
from .markdown_generation_strategy import DefaultMarkdownGenerator
2921
from .content_filter_strategy import (
30-
PruningContentFilter,
3122
BM25ContentFilter,
3223
LLMContentFilter,
24+
PruningContentFilter,
3325
RelevantContentFilter,
3426
)
35-
from .models import CrawlResult, MarkdownGenerationResult
36-
from .async_dispatcher import (
37-
MemoryAdaptiveDispatcher,
38-
SemaphoreDispatcher,
39-
RateLimiter,
40-
CrawlerMonitor,
41-
DisplayMode,
42-
BaseDispatcher,
27+
from .content_scraping_strategy import (
28+
ContentScrapingStrategy,
29+
LXMLWebScrapingStrategy,
30+
WebScrapingStrategy,
4331
)
44-
from .docker_client import Crawl4aiDockerClient
45-
from .hub import CrawlerHub
46-
from .browser_profiler import BrowserProfiler
4732
from .deep_crawling import (
48-
DeepCrawlStrategy,
33+
BestFirstCrawlingStrategy,
4934
BFSDeepCrawlStrategy,
50-
FilterChain,
51-
URLPatternFilter,
52-
DomainFilter,
53-
ContentTypeFilter,
54-
URLFilter,
55-
FilterStats,
56-
SEOFilter,
57-
KeywordRelevanceScorer,
58-
URLScorer,
5935
CompositeScorer,
36+
ContentTypeFilter,
37+
DeepCrawlDecorator,
38+
DeepCrawlStrategy,
39+
DFSDeepCrawlStrategy,
6040
DomainAuthorityScorer,
41+
DomainFilter,
42+
FilterChain,
43+
FilterStats,
6144
FreshnessScorer,
45+
KeywordRelevanceScorer,
6246
PathDepthScorer,
63-
BestFirstCrawlingStrategy,
64-
DFSDeepCrawlStrategy,
65-
DeepCrawlDecorator,
47+
SEOFilter,
48+
URLFilter,
49+
URLPatternFilter,
50+
URLScorer,
51+
)
52+
from .deep_crawling.scorers import (
53+
ScoringStats,
54+
)
55+
from .docker_client import Crawl4aiDockerClient
56+
from .extraction_strategy import (
57+
CosineStrategy,
58+
ExtractionStrategy,
59+
JsonCssExtractionStrategy,
60+
JsonXPathExtractionStrategy,
61+
LLMExtractionStrategy,
62+
)
63+
from .hub import CrawlerHub
64+
from .markdown_generation_strategy import DefaultMarkdownGenerator
65+
from .models import CrawlResult, MarkdownGenerationResult
66+
from .proxy_strategy import (
67+
ProxyRotationStrategy,
68+
RoundRobinProxyStrategy,
6669
)
6770

6871
__all__ = [
@@ -120,6 +123,8 @@
120123
"Crawl4aiDockerClient",
121124
"ProxyRotationStrategy",
122125
"RoundRobinProxyStrategy",
126+
"ScoringStats",
127+
"Logger",
123128
]
124129

125130

0 commit comments

Comments
 (0)