Skip to content

Commit d892116

Browse files
committed
chore: add relevant tests
Add relevant tests for the following: - markdown generator - http crawler strategy
1 parent 5bbb521 commit d892116

File tree

2 files changed

+155
-161
lines changed

2 files changed

+155
-161
lines changed
Lines changed: 80 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1-
import asyncio
2-
from typing import Dict
1+
import sys
2+
3+
from _pytest.mark.structures import ParameterSet # pyright: ignore[reportPrivateImportUsage]
34
from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
45
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
56
import time
7+
import pytest
68

7-
# Test HTML samples
89
TEST_HTML_SAMPLES = {
910
"basic": """
1011
<body>
@@ -16,7 +17,7 @@
1617
</div>
1718
</body>
1819
""",
19-
20+
2021
"complex": """
2122
<body>
2223
<nav>Navigation menu that should be removed</nav>
@@ -27,7 +28,7 @@
2728
<p>Important content paragraph with <a href="http://test.com">useful link</a>.</p>
2829
<section>
2930
<h2>Key Section</h2>
30-
<p>Detailed explanation with multiple sentences. This should be kept
31+
<p>Detailed explanation with multiple sentences. This should be kept
3132
in the final output. Very important information here.</p>
3233
</section>
3334
</article>
@@ -36,7 +37,7 @@
3637
<footer>Footer content to remove</footer>
3738
</body>
3839
""",
39-
40+
4041
"edge_cases": """
4142
<body>
4243
<div>
@@ -50,10 +51,10 @@
5051
</div>
5152
</body>
5253
""",
53-
54+
5455
"links_citations": """
5556
<body>
56-
<h1>Document with Links</h1>
57+
<h1>Article with Links</h1>
5758
<p>First link to <a href="http://example.com/1">Example 1</a></p>
5859
<p>Second link to <a href="http://example.com/2" title="Example 2">Test 2</a></p>
5960
<p>Image link: <img src="test.jpg" alt="test image"></p>
@@ -62,110 +63,88 @@
6263
""",
6364
}
6465

65-
def test_content_filters() -> Dict[str, Dict[str, int]]:
66+
# Markdown generators under test, keyed by the label that ends up in the
# parametrized test ids.
GENERATORS: dict[str, DefaultMarkdownGenerator] = {
    "no_filter": DefaultMarkdownGenerator(),
    "pruning": DefaultMarkdownGenerator(
        content_filter=PruningContentFilter(threshold=0.48),
    ),
    "bm25": DefaultMarkdownGenerator(
        content_filter=BM25ContentFilter(
            user_query="test article content important",
        ),
    ),
}
77+
78+
79+
def filter_params() -> list[ParameterSet]:
    """Build one pytest parameter per entry in ``TEST_HTML_SAMPLES``.

    Each parameter carries the sample's HTML string and uses the sample's
    name as its test id.
    """
    return [
        pytest.param(TEST_HTML_SAMPLES[sample_name], id=sample_name)
        for sample_name in TEST_HTML_SAMPLES
    ]
84+
85+
@pytest.mark.parametrize("html", filter_params())
@pytest.mark.skip(reason="Require BM25 idf calculation fix")
def test_content_filters(html: str) -> None:
    """Check that both content filters return something for an HTML sample.

    Runs ``PruningContentFilter`` and ``BM25ContentFilter`` over ``html``,
    asserts that each result is non-empty, and prints the original and
    filtered lengths together with the time each filter took.

    Args:
        html: One HTML sample supplied by ``filter_params()``.
    """
    # Initialize filters
    pruning_filter = PruningContentFilter(
        threshold=0.48,
        threshold_type="fixed",
        min_word_threshold=2,
    )
    bm25_filter = BM25ContentFilter(
        bm25_threshold=1.0,
        user_query="test article content important",
    )

    # perf_counter() is monotonic and high resolution, so the measured
    # interval cannot be skewed by wall-clock adjustments.
    start_time = time.perf_counter()
    pruned_content = pruning_filter.filter_content(html)
    pruning_time = time.perf_counter() - start_time

    start_time = time.perf_counter()
    bm25_content = bm25_filter.filter_content(html)
    bm25_time = time.perf_counter() - start_time

    assert len(pruned_content) > 0
    assert len(bm25_content) > 0
    print(f"Original length: {len(html)}")
    print(f"Pruned length: {sum(len(c) for c in pruned_content)} ({pruning_time:.3f}s)")
    print(f"BM25 length: {sum(len(c) for c in bm25_content)} ({bm25_time:.3f}s)")
116+
117+
118+
def markdown_params() -> list[ParameterSet]:
    """Return the test parameters for the markdown generation tests.

    Pairs every sample in ``TEST_HTML_SAMPLES`` with every generator in
    ``GENERATORS``. Each parameter carries the HTML string and the generator,
    and its id is ``"<sample name>_<generator name>"``.
    """
    return [
        pytest.param(html, generator, id=f"{name}_{gen_name}")
        for name, html in TEST_HTML_SAMPLES.items()
        for gen_name, generator in GENERATORS.items()
    ]
125+
126+
@pytest.mark.parametrize("html,generator", markdown_params())
def test_markdown_generation(html: str, generator: DefaultMarkdownGenerator) -> None:
    """Check that a generator produces every markdown field for a sample.

    Calls ``generator.generate_markdown`` with citations enabled and asserts
    that the result and its ``raw_markdown``, ``fit_markdown`` and
    ``references_markdown`` attributes are not ``None``. The elapsed time and
    the length of each field are printed.

    Args:
        html: One HTML sample supplied by ``markdown_params()``.
        generator: The markdown generator paired with that sample.
    """
    # Time only the generation call, using the monotonic perf_counter()
    # clock; the assertions below are excluded from the measurement.
    start_time = time.perf_counter()
    result = generator.generate_markdown(
        html,
        base_url="http://example.com",
        citations=True,
    )
    elapsed = time.perf_counter() - start_time

    assert result is not None
    assert result.raw_markdown is not None
    assert result.fit_markdown is not None
    assert result.references_markdown is not None

    print(f"Time: {elapsed:.3f}s")
    print(f"Raw length: {len(result.raw_markdown)}")
    print(f"Fit length: {len(result.fit_markdown) if result.fit_markdown else 0}")
    print(f"Citations: {len(result.references_markdown)}")
169146

170147
if __name__ == "__main__":
    import subprocess

    # Run pytest on this file, forwarding any command-line arguments.
    # Launching it as "<this interpreter> -m pytest" uses the pytest installed
    # for the running Python instead of whichever "pytest" executable happens
    # to be first on PATH (which may be missing or from another environment).
    sys.exit(
        subprocess.call([sys.executable, "-m", "pytest", *sys.argv[1:], sys.argv[0]])
    )

0 commit comments

Comments
 (0)