
Commit 85d3a85

fix: tests to run under pytest

Fix tests to run fully under pytest. This includes:

- Fixing filenames: removing dots and correcting typos
- Removing __init__ methods, which are not supported by pytest
- Implementing parametrisation so tests can be run individually
- Adding timeouts so tests can't run forever
- Replacing print and logging calls with assertions to prevent false successes
- Removing unused imports and adding missing ones
- Marking tests with @pytest.mark.asyncio where appropriate
- Using HTTP constants to avoid magic numbers
- Adding type hints to improve linting and surface issues
- Using a local server for API tests to improve debugging and eliminate the Docker dependency
- Calling pytest in __main__ to allow running tests from the command line
- Skipping broken tests
- Fixing out-of-date logic and invalid method parameters
- Re-enabling disabled and commented-out tests after fixing them
- Adding missing test data
- Updating tests that depend on altered external CSS or HTML structure
- Automatically skipping tests if the API key is not set

If you need to spend time debugging a test, comment out the default timeout in pyproject.toml under [tool.pytest.ini_options].
1 parent e09474f commit 85d3a85
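The automatic skip when an API key is missing, called out in the message above, does not appear in the two diffs shown below. A minimal sketch of the standard pytest pattern it refers to, assuming a hypothetical OPENAI_API_KEY environment variable and test name (neither is confirmed by this commit):

import os

import pytest

# Hypothetical key name, for illustration only; the commit does not say
# which environment variable the tests actually check.
API_KEY = os.environ.get("OPENAI_API_KEY")


@pytest.mark.skipif(not API_KEY, reason="OPENAI_API_KEY is not set")
@pytest.mark.asyncio
async def test_llm_extraction():
    ...  # body of a test that needs the external API

Because skipif is evaluated at collection time, runs without the key report these tests as skipped rather than failed.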

File tree: 72 files changed (+5145, -4677 lines)


Diff for: tests/20241401/test_advanced_deep_crawl.py (+24, -6)

@@ -1,29 +1,35 @@
-import asyncio
+import sys
 import time
 
+from httpx import codes
+import pytest
 
 from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
 from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
-from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
+from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
 from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter, DomainFilter, ContentTypeFilter, ContentRelevanceFilter
 from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
-# from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
+from crawl4ai.types import CrawlResult
 
 
-async def main():
+@pytest.mark.asyncio
+@pytest.mark.timeout(60)
+async def test_deep_crawl():
     """Example deep crawl of documentation site."""
     filter_chain = FilterChain([
         URLPatternFilter(patterns=["*2025*"]),
         DomainFilter(allowed_domains=["techcrunch.com"]),
         ContentRelevanceFilter(query="Use of artificial intelligence in Defence applications", threshold=1),
         ContentTypeFilter(allowed_types=["text/html","application/javascript"])
     ])
+    max_pages: int = 5
     config = CrawlerRunConfig(
         deep_crawl_strategy = BestFirstCrawlingStrategy(
             max_depth=2,
             include_external=False,
             filter_chain=filter_chain,
             url_scorer=KeywordRelevanceScorer(keywords=["anduril", "defence", "AI"]),
+            max_pages=max_pages,
         ),
         stream=False,
         verbose=True,
@@ -35,12 +41,24 @@ async def main():
         print("Starting deep crawl in streaming mode:")
         config.stream = True
         start_time = time.perf_counter()
+        result: CrawlResult
+        pages: int = 0
         async for result in await crawler.arun(
             url="https://techcrunch.com",
             config=config
         ):
+            assert result.status_code == codes.OK
+            assert result.url
+            assert result.metadata
+            assert result.metadata.get("depth", -1) >= 0
+            assert result.metadata.get("depth", -1) <= 2
+            pages += 1
             print(f"→ {result.url} (Depth: {result.metadata.get('depth', 0)})")
-        print(f"Duration: {time.perf_counter() - start_time:.2f} seconds")
+
+        print(f"Crawled {pages} pages in: {time.perf_counter() - start_time:.2f} seconds")
+        assert pages == max_pages
 
 if __name__ == "__main__":
-    asyncio.run(main())
+    import subprocess
+
+    sys.exit(subprocess.call(["pytest", *sys.argv[1:], sys.argv[0]]))
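The @pytest.mark.timeout(60) marker above overrides, for this one test, the suite-wide default that the commit message says lives in pyproject.toml under [tool.pytest.ini_options]. A sketch of what that section plausibly looks like, with an illustrative value (the repository's actual default is not shown in this commit):

# pyproject.toml
[tool.pytest.ini_options]
# Suite-wide default from the pytest-timeout plugin; per-test
# @pytest.mark.timeout(...) markers override it. Comment this out when
# stepping through a test in a debugger, as the message advises.
timeout = 30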
Diff for: (second test file; name hidden in the large-commit view)

@@ -1,16 +1,30 @@
-import asyncio
+import sys
+
+from httpx import codes
+import pytest
+
 from crawl4ai import (
     AsyncWebCrawler,
-    CrawlerRunConfig,
-    HTTPCrawlerConfig,
     CacheMode,
+    CrawlerRunConfig,
     DefaultMarkdownGenerator,
-    PruningContentFilter
+    HTTPCrawlerConfig,
+    PruningContentFilter,
 )
 from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
 from crawl4ai.async_logger import AsyncLogger
 
-async def main():
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "url",
+    [
+        "https://example.com",
+        "https://httpbin.org/get",
+        "raw://<html><body>Test content</body></html>"
+    ]
+)
+async def test_async_crawl(url: str):
     # Initialize HTTP crawler strategy
     http_strategy = AsyncHTTPCrawlerStrategy(
         browser_config=HTTPCrawlerConfig(
@@ -27,30 +41,23 @@ async def main():
         cache_mode=CacheMode.BYPASS,
         markdown_generator=DefaultMarkdownGenerator(
             content_filter=PruningContentFilter(
-                    threshold=0.48,
-                    threshold_type="fixed",
+                threshold=0.48,
+                threshold_type="fixed",
                 min_word_threshold=0
             )
         )
     )
-
-    # Test different URLs
-    urls = [
-        "https://example.com",
-        "https://httpbin.org/get",
-        "raw://<html><body>Test content</body></html>"
-    ]
-
-    for url in urls:
-        print(f"\n=== Testing {url} ===")
-        try:
-            result = await crawler.arun(url=url, config=crawler_config)
-            print(f"Status: {result.status_code}")
-            print(f"Raw HTML length: {len(result.html)}")
-            if hasattr(result, 'markdown'):
-                print(f"Markdown length: {len(result.markdown.raw_markdown)}")
-        except Exception as e:
-            print(f"Error: {e}")
+
+    result = await crawler.arun(url=url, config=crawler_config)
+    assert result.status_code == codes.OK
+    assert result.html
+    assert result.markdown
+    assert result.markdown.raw_markdown
+    print(f"Status: {result.status_code}")
+    print(f"Raw HTML length: {len(result.html)}")
+    print(f"Markdown length: {len(result.markdown.raw_markdown)}")
 
 if __name__ == "__main__":
-    asyncio.run(main())
+    import subprocess
+
+    sys.exit(subprocess.call(["pytest", *sys.argv[1:], sys.argv[0]]))
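Both tests now compare result.status_code against httpx's codes enum rather than the literal 200 ("using HTTP constants to avoid magic numbers" in the message above). This works because httpx.codes is an integer enum, so its members compare equal to plain ints; a minimal demonstration:

from httpx import codes

# httpx.codes members are IntEnum values, so they compare equal to the
# bare integers they name.
assert codes.OK == 200
assert codes.NOT_FOUND == 404

# An assertion written this way fails with the symbolic name in the
# message, which reads better than a bare status number.
status_code: int = 200
assert status_code == codes.OK

Since test_async_crawl is parametrised, a single URL case can also be selected with pytest's node-ID syntax, e.g. pytest "tests/test_file.py::test_async_crawl[https://example.com]" (the path is illustrative, as the file name is hidden in this view).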
