
Commit 85d3a85

fix: tests to run under pytest

Fix tests to run fully under pytest. This includes:

- Fixing filenames: removing dots and correcting typos
- Removing __init__ methods, which are not supported by pytest
- Implementing parametrisation so tests can be run individually
- Adding timeouts so tests can't run forever
- Replacing print and logging calls with assertions to prevent false successes
- Removing unused imports and adding missing ones
- Marking tests with @pytest.mark.asyncio where appropriate
- Using HTTP constants to avoid magic numbers
- Adding type hints to improve linting and surface issues
- Using a local server for API tests to improve debugging and eliminate the Docker dependency
- Calling pytest in __main__ to allow running tests from the command line
- Skipping broken tests
- Fixing out-of-date logic and invalid method parameters
- Re-enabling disabled and commented-out tests after fixing them
- Adding missing test data
- Updating tests that depend on altered external CSS or HTML structure
- Automatically skipping tests if the API key is not set

If you need to spend time debugging a test, comment out the default timeout in pyproject.toml under [tool.pytest.ini_options].
1 parent e09474f commit 85d3a85
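The automatic skip when an API key is missing, called out in the message above, does not appear in the two diffs shown below. A minimal sketch of the standard pytest pattern it refers to, assuming a hypothetical OPENAI_API_KEY environment variable and test name (neither is confirmed by this commit):

import os

import pytest

# Hypothetical key name, for illustration only; the commit does not say
# which environment variable the tests actually check.
API_KEY = os.environ.get("OPENAI_API_KEY")


@pytest.mark.skipif(not API_KEY, reason="OPENAI_API_KEY is not set")
@pytest.mark.asyncio
async def test_llm_extraction():
    ...  # body of a test that needs the external API

Because skipif is evaluated at collection time, runs without the key report these tests as skipped rather than failed.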

File tree: 72 files changed (+5145, -4677 lines)


Diff for: tests/20241401/test_advanced_deep_crawl.py (+24, -6)

@@ -1,29 +1,35 @@
-import asyncio
+import sys
 import time
 
+from httpx import codes
+import pytest
 
 from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
 from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
-from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
+from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
 from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter, DomainFilter, ContentTypeFilter, ContentRelevanceFilter
 from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
-# from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
+from crawl4ai.types import CrawlResult
 
 
-async def main():
+@pytest.mark.asyncio
+@pytest.mark.timeout(60)
+async def test_deep_crawl():
     """Example deep crawl of documentation site."""
     filter_chain = FilterChain([
         URLPatternFilter(patterns=["*2025*"]),
         DomainFilter(allowed_domains=["techcrunch.com"]),
         ContentRelevanceFilter(query="Use of artificial intelligence in Defence applications", threshold=1),
         ContentTypeFilter(allowed_types=["text/html","application/javascript"])
     ])
+    max_pages: int = 5
     config = CrawlerRunConfig(
         deep_crawl_strategy = BestFirstCrawlingStrategy(
             max_depth=2,
             include_external=False,
             filter_chain=filter_chain,
             url_scorer=KeywordRelevanceScorer(keywords=["anduril", "defence", "AI"]),
+            max_pages=max_pages,
         ),
         stream=False,
         verbose=True,
@@ -35,12 +41,24 @@ async def main():
         print("Starting deep crawl in streaming mode:")
         config.stream = True
         start_time = time.perf_counter()
+        result: CrawlResult
+        pages: int = 0
         async for result in await crawler.arun(
             url="https://techcrunch.com",
             config=config
         ):
+            assert result.status_code == codes.OK
+            assert result.url
+            assert result.metadata
+            assert result.metadata.get("depth", -1) >= 0
+            assert result.metadata.get("depth", -1) <= 2
+            pages += 1
             print(f"→ {result.url} (Depth: {result.metadata.get('depth', 0)})")
-        print(f"Duration: {time.perf_counter() - start_time:.2f} seconds")
+
+        print(f"Crawled {pages} pages in: {time.perf_counter() - start_time:.2f} seconds")
+        assert pages == max_pages
 
 if __name__ == "__main__":
-    asyncio.run(main())
+    import subprocess
+
+    sys.exit(subprocess.call(["pytest", *sys.argv[1:], sys.argv[0]]))
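The @pytest.mark.timeout(60) marker above overrides, for this one test, the suite-wide default that the commit message says lives in pyproject.toml under [tool.pytest.ini_options]. A sketch of what that section plausibly looks like, with an illustrative value (the repository's actual default is not shown in this commit):

# pyproject.toml
[tool.pytest.ini_options]
# Suite-wide default from the pytest-timeout plugin; per-test
# @pytest.mark.timeout(...) markers override it. Comment this out when
# stepping through a test in a debugger, as the message advises.
timeout = 30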
Diff for: (second test file; name hidden in the large-commit view)

@@ -1,16 +1,30 @@
-import asyncio
+import sys
+
+from httpx import codes
+import pytest
+
 from crawl4ai import (
     AsyncWebCrawler,
-    CrawlerRunConfig,
-    HTTPCrawlerConfig,
     CacheMode,
+    CrawlerRunConfig,
     DefaultMarkdownGenerator,
-    PruningContentFilter
+    HTTPCrawlerConfig,
+    PruningContentFilter,
 )
 from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
 from crawl4ai.async_logger import AsyncLogger
 
-async def main():
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "url",
+    [
+        "https://example.com",
+        "https://httpbin.org/get",
+        "raw://<html><body>Test content</body></html>"
+    ]
+)
+async def test_async_crawl(url: str):
     # Initialize HTTP crawler strategy
     http_strategy = AsyncHTTPCrawlerStrategy(
         browser_config=HTTPCrawlerConfig(
@@ -27,30 +41,23 @@ async def main():
         cache_mode=CacheMode.BYPASS,
         markdown_generator=DefaultMarkdownGenerator(
             content_filter=PruningContentFilter(
-                    threshold=0.48,
-                    threshold_type="fixed",
+                threshold=0.48,
+                threshold_type="fixed",
                 min_word_threshold=0
             )
         )
     )
-
-    # Test different URLs
-    urls = [
-        "https://example.com",
-        "https://httpbin.org/get",
-        "raw://<html><body>Test content</body></html>"
-    ]
-
-    for url in urls:
-        print(f"\n=== Testing {url} ===")
-        try:
-            result = await crawler.arun(url=url, config=crawler_config)
-            print(f"Status: {result.status_code}")
-            print(f"Raw HTML length: {len(result.html)}")
-            if hasattr(result, 'markdown'):
-                print(f"Markdown length: {len(result.markdown.raw_markdown)}")
-        except Exception as e:
-            print(f"Error: {e}")
+
+    result = await crawler.arun(url=url, config=crawler_config)
+    assert result.status_code == codes.OK
+    assert result.html
+    assert result.markdown
+    assert result.markdown.raw_markdown
+    print(f"Status: {result.status_code}")
+    print(f"Raw HTML length: {len(result.html)}")
+    print(f"Markdown length: {len(result.markdown.raw_markdown)}")
 
 if __name__ == "__main__":
-    asyncio.run(main())
+    import subprocess
+
+    sys.exit(subprocess.call(["pytest", *sys.argv[1:], sys.argv[0]]))
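Both tests now compare result.status_code against httpx's codes enum rather than the literal 200 ("using HTTP constants to avoid magic numbers" in the message above). This works because httpx.codes is an integer enum, so its members compare equal to plain ints; a minimal demonstration:

from httpx import codes

# httpx.codes members are IntEnum values, so they compare equal to the
# bare integers they name.
assert codes.OK == 200
assert codes.NOT_FOUND == 404

# An assertion written this way fails with the symbolic name in the
# message, which reads better than a bare status number.
status_code: int = 200
assert status_code == codes.OK

Since test_async_crawl is parametrised, a single URL case can also be selected with pytest's node-ID syntax, e.g. pytest "tests/test_file.py::test_async_crawl[https://example.com]" (the path is illustrative, as the file name is hidden in this view).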
