Skip to content

Commit 58d1287

Browse files
committed
Tests for openbis_processor
1 parent 1093c2b commit 58d1287

File tree

3 files changed: +119 -70 lines changed

3 files changed: +119 -70 lines changed

desi_vectordb/chroma.sqlite3

0 Bytes
Binary file not shown.

src/desi/processor/openbis_processor.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -292,7 +292,7 @@ def _create_and_persist_vectordb(chunks: List[Document], persist_directory: str)
292292
logger.info("-> Vector database processing complete.")
293293

294294
@staticmethod
295-
def export_chunks(chunks: List[Document], output_dir: str):
295+
def _export_chunks(chunks: List[Document], output_dir: str):
296296
"""Exports the list of Document chunks to JSON, CSV, and JSONL files."""
297297
if not os.path.exists(output_dir):
298298
os.makedirs(output_dir)

tests/test_openbis_processor.py

Lines changed: 118 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -1,110 +1,159 @@
11
# Make sure the src directory is in the path for imports
2+
import os
23
import sys
34
from pathlib import Path
4-
from unittest.mock import mock_open
5+
from unittest.mock import call
56

67
import pytest
78

8-
sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "src"))
9+
# Add the src directory to the path to ensure imports work from the root
10+
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
911

10-
from desi.processor.openbis_processor import (
11-
ContentChunker,
12-
Document,
13-
OpenBisProcessor,
14-
)
15-
16-
# --- Tests for the Cleaning Function ---
12+
from langchain_core.documents import Document
1713

14+
from desi.processor.openbis_processor import ContentChunker, OpenBisProcessor
1815

19-
def test_clean_removes_permalink():
20-
"""Tests that the [] permalink artifact is removed."""
21-
dirty = '## My Header[](#my-header "Permalink to this heading")'
22-
clean = "## My Header"
23-
assert OpenBisProcessor._clean_markdown_content(dirty) == clean
24-
25-
26-
def test_clean_dedents_code_block():
27-
"""Tests that indented code blocks are properly dedented."""
28-
dirty = " // This is some code\n if (true) {\n // more code\n }"
29-
clean = "// This is some code\nif (true) {\n // more code\n}"
30-
assert OpenBisProcessor._clean_markdown_content(dirty) == clean
16+
# --- Tests for the Cleaning Function ---
3117

3218

33-
def test_clean_handles_nbsp_and_dedents():
34-
"""Tests that non-breaking spaces are handled correctly, allowing dedent to work."""
35-
dirty = " line 1\n\u00a0\n line 2" # \u00a0 is a non-breaking space
36-
clean = "line 1\n\nline 2"
37-
assert OpenBisProcessor._clean_markdown_content(dirty) == clean
19+
@pytest.mark.parametrize(
20+
"dirty_input, expected_clean",
21+
[
22+
('## My Header[](#my-header "Permalink to this heading")', "## My Header"),
23+
(" // code block", "// code block"),
24+
(" line 1\n\u00a0\n line 2", "line 1\n\nline 2"),
25+
("Paragraph 1\n\n\n\nParagraph 2", "Paragraph 1\n\nParagraph 2"),
26+
("This is a clean line.", "This is a clean line."),
27+
("Title[](...)", "Title[](...)"),
28+
],
29+
)
30+
def test_clean_markdown_content(dirty_input, expected_clean):
31+
"""Tests various scenarios for the markdown cleaning logic."""
32+
assert OpenBisProcessor._clean_markdown_content(dirty_input) == expected_clean
3833

3934

40-
def test_clean_collapses_newlines():
41-
"""Tests that more than two newlines are collapsed."""
42-
dirty = "Paragraph 1\n\n\n\nParagraph 2"
43-
clean = "Paragraph 1\n\nParagraph 2"
44-
assert OpenBisProcessor._clean_markdown_content(dirty) == clean
35+
# --- Tests for the ContentChunker Logic ---
4536

4637

47-
# --- Tests for the Chunking Logic ---
38+
@pytest.fixture
39+
def chunker():
40+
"""Provides a default ContentChunker instance."""
41+
return ContentChunker(min_chunk_size=100, max_chunk_size=300)
4842

4943

50-
def test_content_chunker_splits_correctly():
51-
"""Tests the openBIS header-aware chunking logic."""
52-
chunker = ContentChunker(min_chunk_size=100, max_chunk_size=200)
44+
def test_content_chunker_splits_by_h2(chunker):
45+
"""Tests that content is split correctly at H2 (##) headers."""
5346
long_content = (
54-
"## Section Alpha\nThis is content for the first section. It is long enough to be its own chunk probably."
55-
"\n\n### Subsection A\nMore content here.\n\n"
56-
"## Section Beta\nThis is the second section. It should definitely start a new chunk because it is a new H2."
47+
"## Section Alpha\nThis is content for the first section. It is long enough to be its own chunk and should not be merged with the next one."
48+
"\n\n### Subsection A\nMore content here that belongs to Section Alpha.\n\n"
49+
"## Section Beta\nThis is the second section. It should definitely start a new chunk because it is a new H2 header."
5750
)
58-
5951
chunks = chunker.chunk_content(long_content)
60-
6152
assert len(chunks) == 2
62-
assert "## Section Alpha" in chunks[0]
63-
assert "## Section Beta" in chunks[1]
53+
assert chunks[0].startswith("## Section Alpha")
54+
assert "Subsection A" in chunks[0]
55+
assert chunks[1].startswith("## Section Beta")
6456

6557

66-
def test_content_chunker_keeps_short_content_as_one_chunk():
67-
"""Tests that short documents are not split."""
68-
chunker = ContentChunker()
69-
short_content = "## A Title\n\nJust a little bit of text."
58+
def test_content_chunker_keeps_short_content_as_one_chunk(chunker):
59+
"""Tests that short documents are not split, even if they have headers."""
60+
short_content = "## A Title\n\nJust a little bit of text here."
7061
chunks = chunker.chunk_content(short_content)
7162
assert len(chunks) == 1
7263
assert chunks[0] == short_content
7364

7465

75-
# --- Integration Test for the Main Function ---
66+
def test_content_chunker_splits_long_section_while_preserving_context(chunker):
67+
"""Tests that a section exceeding max_chunk_size is split, and the header context is re-added."""
68+
long_section = (
69+
"## Very Long Section\n\n"
70+
"This is the first paragraph of a very long section. It contains a lot of text to ensure it will exceed the maximum chunk size of 300 characters defined in the fixture. We will keep adding sentences to pad it out. More text here to fill space. And even more text. \n\n"
71+
"This is the second paragraph which should definitely be in a new chunk. By placing this text here, we force the chunker to make a decision and split the content, and the test will verify that the '## Very Long Section' header is prepended to this new chunk."
72+
)
73+
chunks = chunker.chunk_content(long_section)
74+
assert len(chunks) == 2
75+
assert chunks[0].startswith("## Very Long Section")
76+
assert "first paragraph" in chunks[0]
77+
assert "second paragraph" not in chunks[0]
78+
assert chunks[1].startswith("## Very Long Section")
79+
assert "second paragraph" in chunks[1]
80+
81+
82+
# --- Tests for the Main Processor Class ---
7683

7784

78-
def test_chunk_openbis_document_integration(tmp_path):
79-
"""Tests the full processing of a single openBIS file, including metadata enrichment."""
80-
# Create a fake directory and file
81-
root_dir = tmp_path
82-
md_file = (
83-
root_dir
84-
/ "en_20.10.0-11_user-documentation_general-users_managing-lab-stocks.md"
85+
@pytest.fixture
86+
def processor(tmp_path):
87+
"""Provides an OpenBisProcessor instance initialized with a temporary directory."""
88+
return OpenBisProcessor(
89+
root_directory=str(tmp_path),
90+
output_directory=str(tmp_path / "processed"),
91+
chroma_persist_directory=str(tmp_path / "vectordb"),
8592
)
86-
md_file.write_text("## Managing Lab Stocks\n\nThis is some sample content.")
8793

88-
processor = OpenBisProcessor(
89-
root_directory=str(root_dir),
90-
output_directory="dummy_output",
91-
chroma_persist_directory="dummy_chroma",
94+
95+
def test_chunk_openbis_document_integration(processor, tmp_path):
96+
"""
97+
Tests the full processing of a single file, including cleaning, metadata enrichment, and chunking.
98+
"""
99+
file_name = "en_20.10.0-11_user-documentation_general-users_managing-lab-stocks.md"
100+
md_file = tmp_path / file_name
101+
102+
md_file.write_text(
103+
'## Managing Lab Stocks[](#my-header "Permalink to this heading")\n\nThis is some sample content.',
104+
encoding="utf-8",
92105
)
93106

94-
chunks = processor._chunk_openbis_document(str(md_file))
107+
chunks = processor._chunk_openbis_document(str(md_file), str(tmp_path))
95108

96109
assert len(chunks) == 1
97110
chunk = chunks[0]
98-
99-
# Verify metadata
100111
assert chunk.metadata["origin"] == "openbis"
101112
assert chunk.metadata["section"] == "User Documentation"
102-
assert chunk.metadata["source"].endswith("managing-lab-stocks.md")
103-
assert chunk.metadata["id"].startswith(
104-
"openbis-en_20.10.0-11_user-documentation_general-users_managing-lab-stocks-"
113+
assert chunk.metadata["source"] == file_name.replace("\\", "/")
114+
assert (
115+
chunk.metadata["id"]
116+
== "openbis-en_20.10.0-11_user-documentation_general-users_managing-lab-stocks-0"
105117
)
106-
assert chunk.metadata["title"] == "Managing lab stocks"
107-
assert "https://openbis.readthedocs.io/" in chunk.metadata["url"]
108-
109-
# Verify content
110118
assert "## Managing Lab Stocks" in chunk.page_content
119+
assert "[]" not in chunk.page_content
120+
121+
122+
def test_process_all_openbis_files(mocker, processor):
123+
"""Tests the file discovery and processing orchestration."""
124+
mocker.patch("os.path.isdir", return_value=True)
125+
mocker.patch("os.walk").return_value = [
126+
("/fake_root", (), ("doc1.md", "doc2.txt", "doc3.md")),
127+
]
128+
mock_chunker = mocker.patch.object(
129+
processor, "_chunk_openbis_document", return_value=[Document(page_content="")]
130+
)
131+
132+
result_chunks = processor._process_all_openbis_files("/fake_root")
133+
134+
assert mock_chunker.call_count == 2
135+
assert len(result_chunks) == 2
136+
expected_calls = [
137+
call(os.path.join("/fake_root", "doc1.md"), "/fake_root"),
138+
call(os.path.join("/fake_root", "doc3.md"), "/fake_root"),
139+
]
140+
mock_chunker.assert_has_calls(expected_calls, any_order=True)
141+
142+
143+
def test_process_method_orchestration(mocker, processor):
144+
"""
145+
Tests that the main `process` method calls its helpers in the correct sequence.
146+
"""
147+
mock_process_files = mocker.patch.object(
148+
processor,
149+
"_process_all_openbis_files",
150+
return_value=[Document(page_content="chunk", metadata={})],
151+
)
152+
mock_export = mocker.patch.object(processor, "_export_chunks")
153+
mock_vectordb = mocker.patch.object(processor, "_create_and_persist_vectordb")
154+
155+
processor.process()
156+
157+
mock_process_files.assert_called_once()
158+
mock_export.assert_called_once()
159+
mock_vectordb.assert_called_once()

0 commit comments

Comments
 (0)