66
77import pytest
88
9- sys .path .insert (0 , str (Path (__file__ ).resolve ().parents [2 ] / "src" ))
9+ # Add the src directory to the path to ensure imports work from the root
10+ sys .path .insert (0 , str (Path (__file__ ).parent .parent / "src" ))
1011
1112from desi .processor .ds_processor import Document , DsWikiProcessor
1213
@@ -47,6 +48,21 @@ def sample_mermaid_diagram():
4748"""
4849
4950
@pytest.fixture
def sample_faq_content():
    """Return a two-entry FAQ snippet built from <details>/<summary> tags.

    The first answer is bare text placed directly inside <details>; the
    second answer is wrapped in a <p> element.
    """
    faq_html = """
<details>
<summary>What is the first question?</summary>
This is the answer to the first question.
</details>
<details>
<summary>What is the second question?</summary>
<p>This is the answer to the second question, with more detail.</p>
</details>
"""
    return faq_html
64+
65+
5066# --- Tests for Individual Functions ---
5167
5268
@@ -115,6 +131,25 @@ def test_split_markdown_by_structure_creates_chunks():
115131 assert chunks [0 ].metadata ["origin" ] == "dswiki"
116132
117133
def test_split_faq_style(sample_faq_content):
    """Check that _split_faq_style yields one chunk per <details> entry.

    Each chunk is expected to carry the question and answer text in its
    page content and the bare question under the "faq_question" metadata key.
    """
    base_metadata = {"source": "faq.md"}
    faq_chunks = DsWikiProcessor._split_faq_style(sample_faq_content, base_metadata)

    # Two <details> elements in the fixture -> exactly two chunks.
    assert len(faq_chunks) == 2
    first_chunk, second_chunk = faq_chunks

    # First entry: answer given as bare text.
    first_text = first_chunk.page_content
    assert "Question: What is the first question?" in first_text
    assert "Answer: This is the answer to the first question." in first_text
    assert first_chunk.metadata["faq_question"] == "What is the first question?"

    # Second entry: answer wrapped in a <p> element.
    second_text = second_chunk.page_content
    assert "Question: What is the second question?" in second_text
    assert (
        "Answer: This is the answer to the second question, with more detail."
        in second_text
    )
    assert second_chunk.metadata["faq_question"] == "What is the second question?"
151+
152+
118153def test_chunk_document_integration (tmp_path ):
119154 """An integration test for the main document chunking function."""
120155 # Create a fake directory structure
@@ -126,18 +161,56 @@ def test_chunk_document_integration(tmp_path):
126161 "---\n title: Integration Test\n ---\n ## A Section\n \n Some content here that is definitely long enough to pass the minimum length check after being processed."
127162 )
128163
129- chunks = DsWikiProcessor ._chunk_document (str (md_file ), str (root_dir ))
164+ processor = DsWikiProcessor (
165+ root_directory = str (root_dir ),
166+ output_directory = "dummy_output" ,
167+ chroma_persist_directory = "dummy_chroma" ,
168+ )
169+
170+ chunks = processor ._chunk_document (str (md_file ), str (root_dir ))
130171
131172 assert len (chunks ) == 1
132173 chunk = chunks [0 ]
133174
175+ # --- Verify enriched metadata ---
134176 assert chunk .metadata ["origin" ] == "dswiki"
177+ # Section name should be title-cased and cleaned
135178 assert chunk .metadata ["section" ] == "Use Cases"
179+ # Source should be a clean, relative path with forward slashes
136180 assert chunk .metadata ["source" ] == "use_cases/my-test-file.md"
137-
181+ # ID should be derived from the source path
138182 assert chunk .metadata ["id" ] == "dswiki-use_cases-my-test-file"
139-
140- assert (
141- "Integration Test" in chunk .metadata ["title" ]
142- ) # Assuming title is added in metadata
183+ # Metadata from YAML should be preserved
184+ assert chunk .metadata ["title" ] == "Integration Test"
185+ # Content should be present and cleaned
143186 assert "Some content here" in chunk .page_content
187+
188+
def test_process_all_markdown_files(mocker):
    """Check that only Markdown files found by the directory walk are chunked.

    The filesystem is fully mocked: os.walk reports two ``.md`` files and one
    ``.txt`` file, and _chunk_document is replaced by a stub returning a
    single chunk per call.
    """
    mocker.patch("os.path.isdir", return_value=True)

    # Fake tree: one Markdown file at the root, one Markdown file and one
    # text file in the "docs" subdirectory.
    fake_tree = [
        ("/fake_root", ("docs",), ("root.md",)),
        ("/fake_root/docs", (), ("doc1.md", "doc2.txt")),
    ]
    mocker.patch("os.walk", return_value=fake_tree)

    # Patch on the class (not an instance) so the processor created below
    # picks up the stubbed method.
    mock_chunker = mocker.patch(
        "desi.processor.ds_processor.DsWikiProcessor._chunk_document",
        return_value=[Document("chunk_content", {})],
    )

    processor = DsWikiProcessor(
        root_directory="/fake_root",
        output_directory="",
        chroma_persist_directory="",
    )

    all_chunks = processor._process_all_markdown_files("/fake_root")

    # root.md and doc1.md are processed; doc2.txt is not.
    assert mock_chunker.call_count == 2
    # One stub chunk per processed file.
    assert len(all_chunks) == 2
0 commit comments