Skip to content

Commit 1093c2b

Browse files
committed
Improved tests for ds_processor
1 parent bdf6a70 commit 1093c2b

File tree

4 files changed

+89
-8
lines changed

4 files changed

+89
-8
lines changed

.github/workflows/ci.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,12 @@ jobs:
5252
uv pip install coveralls
5353
5454
- name: Mypy Type Checking
55+
# --- Restrict when the Mypy step runs ---
56+
# This step will now ONLY run on pull requests or on pushes to the main branch.
57+
# It will be SKIPPED on pushes to feature branches.
58+
if: github.event_name == 'pull_request' || (github.event_name == 'push' && github.ref == 'refs/heads/main')
5559
run: |
60+
echo "Running Mypy because this is a pull request or a push to main..."
5661
python -m mypy --ignore-missing-imports --follow-imports=silent --no-strict-optional src/desi tests
5762
5863
- name: Run Unit Tests & Build coverage file

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# Data folders
2+
data/
3+
14
# Byte-compiled / optimized / DLL files
25
__pycache__/
36
*.py[codz]

src/desi/processor/ds_processor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -306,7 +306,7 @@ def _split_markdown_by_structure(
306306
return chunks
307307

308308
# --- Step 3: The "Smart Dispatcher" ---
309-
def _chunk_document(self, file_path, root_directory): # <-- Pass root_directory
309+
def _chunk_document(self, file_path, root_directory):
310310
"""
311311
Loads a document, enriches metadata, and routes it to the best splitting strategy.
312312
"""

tests/test_ds_processor.py

Lines changed: 80 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66

77
import pytest
88

9-
sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "src"))
9+
# Add the repository's src directory to sys.path so the `desi` package can be imported when tests run from the repo root
10+
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
1011

1112
from desi.processor.ds_processor import Document, DsWikiProcessor
1213

@@ -47,6 +48,21 @@ def sample_mermaid_diagram():
4748
"""
4849

4950

51+
@pytest.fixture
52+
def sample_faq_content():
53+
"""Provides sample HTML content structured like an FAQ with <details> tags."""
54+
return """
55+
<details>
56+
<summary>What is the first question?</summary>
57+
This is the answer to the first question.
58+
</details>
59+
<details>
60+
<summary>What is the second question?</summary>
61+
<p>This is the answer to the second question, with more detail.</p>
62+
</details>
63+
"""
64+
65+
5066
# --- Tests for Individual Functions ---
5167

5268

@@ -115,6 +131,25 @@ def test_split_markdown_by_structure_creates_chunks():
115131
assert chunks[0].metadata["origin"] == "dswiki"
116132

117133

134+
def test_split_faq_style(sample_faq_content):
135+
"""Tests the FAQ chunking logic based on <details> HTML tags."""
136+
metadata = {"source": "faq.md"}
137+
chunks = DsWikiProcessor._split_faq_style(sample_faq_content, metadata)
138+
139+
assert len(chunks) == 2
140+
# Verify content of the first chunk
141+
assert "Question: What is the first question?" in chunks[0].page_content
142+
assert "Answer: This is the answer to the first question." in chunks[0].page_content
143+
assert chunks[0].metadata["faq_question"] == "What is the first question?"
144+
# Verify content of the second chunk
145+
assert "Question: What is the second question?" in chunks[1].page_content
146+
assert (
147+
"Answer: This is the answer to the second question, with more detail."
148+
in chunks[1].page_content
149+
)
150+
assert chunks[1].metadata["faq_question"] == "What is the second question?"
151+
152+
118153
def test_chunk_document_integration(tmp_path):
119154
"""An integration test for the main document chunking function."""
120155
# Create a fake directory structure
@@ -126,18 +161,56 @@ def test_chunk_document_integration(tmp_path):
126161
"---\ntitle: Integration Test\n---\n## A Section\n\nSome content here that is definitely long enough to pass the minimum length check after being processed."
127162
)
128163

129-
chunks = DsWikiProcessor._chunk_document(str(md_file), str(root_dir))
164+
processor = DsWikiProcessor(
165+
root_directory=str(root_dir),
166+
output_directory="dummy_output",
167+
chroma_persist_directory="dummy_chroma",
168+
)
169+
170+
chunks = processor._chunk_document(str(md_file), str(root_dir))
130171

131172
assert len(chunks) == 1
132173
chunk = chunks[0]
133174

175+
# --- Verify enriched metadata ---
134176
assert chunk.metadata["origin"] == "dswiki"
177+
# Section name should be title-cased and cleaned
135178
assert chunk.metadata["section"] == "Use Cases"
179+
# Source should be a clean, relative path with forward slashes
136180
assert chunk.metadata["source"] == "use_cases/my-test-file.md"
137-
181+
# ID should be derived from the source path
138182
assert chunk.metadata["id"] == "dswiki-use_cases-my-test-file"
139-
140-
assert (
141-
"Integration Test" in chunk.metadata["title"]
142-
) # Assuming title is added in metadata
183+
# Metadata from YAML should be preserved
184+
assert chunk.metadata["title"] == "Integration Test"
185+
# The body text should be present in the chunk content
143186
assert "Some content here" in chunk.page_content
187+
188+
189+
def test_process_all_markdown_files(mocker):
190+
"""Tests the file discovery and processing orchestration."""
191+
mocker.patch("os.path.isdir", return_value=True)
192+
193+
# Mock os.walk to return a predictable directory structure
194+
mocker.patch("os.walk").return_value = [
195+
("/fake_root", ("docs",), ("root.md",)),
196+
("/fake_root/docs", (), ("doc1.md", "doc2.txt")),
197+
]
198+
199+
# Patch the method on the CLASS before an instance is created.
200+
mock_chunker = mocker.patch(
201+
"desi.processor.ds_processor.DsWikiProcessor._chunk_document"
202+
)
203+
mock_chunker.return_value = [Document("chunk_content", {})]
204+
205+
# Now, create the instance. It will be created with the mocked method.
206+
processor = DsWikiProcessor(
207+
root_directory="/fake_root", output_directory="", chroma_persist_directory=""
208+
)
209+
210+
# Run the function that calls the (now mocked) method
211+
all_chunks = processor._process_all_markdown_files("/fake_root")
212+
213+
# We expect 2 calls because there are two .md files (root.md, doc1.md)
214+
assert mock_chunker.call_count == 2
215+
# We expect a total of 2 chunks since the mock returns one chunk per call
216+
assert len(all_chunks) == 2

0 commit comments

Comments
 (0)