Skip to content

Commit d5cf6d8

Browse files
author
Ahmed Nassar
committed
rebased
1 parent 57ad0e9 commit d5cf6d8

File tree

1 file changed

+239
-0
lines changed

1 file changed

+239
-0
lines changed

test/test_doctags_filtering.py

Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
"""Test DocTags serialization filtering functionality."""
2+
3+
from docling_core.transforms.serializer.doctags import (
4+
DocTagsDocSerializer,
5+
DocTagsParams,
6+
create_task_filtered_params,
7+
)
8+
9+
10+
def test_create_task_filtered_params_defaults():
    """Check that an empty task list yields params with every include flag on.

    Each ``include_*`` attribute listed below must be exactly ``True``.
    """
    filtered = create_task_filtered_params([])
    expected_enabled = (
        "include_ocr",
        "include_layout",
        "include_otsl",
        "include_code",
        "include_picture",
        "include_chart",
        "include_formula",
    )
    for flag_name in expected_enabled:
        assert getattr(filtered, flag_name) is True
20+
21+
22+
def test_create_task_filtered_params_specific_tasks():
    """Check the include flags produced for the task list ``["ocr", "layout"]``.

    Only the two requested tasks are expected to be enabled; every other
    ``include_*`` flag checked here must be exactly ``False``.
    """
    filtered = create_task_filtered_params(["ocr", "layout"])
    expected_flags = {
        "include_ocr": True,
        "include_layout": True,
        "include_otsl": False,
        "include_code": False,
        "include_picture": False,
        "include_chart": False,
        "include_formula": False,
    }
    for flag_name, expected in expected_flags.items():
        # Identity comparison: the flag must be the bool itself, not merely
        # a truthy/falsy value.
        assert getattr(filtered, flag_name) is expected
32+
33+
34+
def test_create_task_filtered_params_with_layout():
    """Check that requesting ``"layout"`` enables both layout and locations."""
    layout_params = create_task_filtered_params(["layout"])

    assert layout_params.include_layout is True
    assert layout_params.add_location is True
39+
40+
41+
def test_create_task_filtered_params_without_layout():
    """Check that omitting ``"layout"`` disables both layout and locations."""
    ocr_only_params = create_task_filtered_params(["ocr"])

    assert ocr_only_params.include_layout is False
    assert ocr_only_params.add_location is False
46+
47+
48+
def test_create_task_filtered_params_with_kwargs():
    """Check that extra keyword arguments end up on the returned params.

    ``xsize`` and ``ysize`` are passed alongside the task list and must be
    readable back from the result, while the requested ``"ocr"`` task
    stays enabled.
    """
    size_overrides = {"xsize": 1000, "ysize": 1000}
    filtered = create_task_filtered_params(["ocr"], **size_overrides)

    for field_name, expected_value in size_overrides.items():
        assert getattr(filtered, field_name) == expected_value
    assert filtered.include_ocr is True
54+
55+
56+
def test_doctags_exclude_ocr(sample_doc):
    """Serialize ``sample_doc`` with ``include_ocr`` turned off.

    Only asserts that the serialization result carries a non-``None`` text.
    """
    doc_serializer = DocTagsDocSerializer(doc=sample_doc)
    patch = DocTagsParams(include_ocr=False).model_dump()
    doc_serializer.params = doc_serializer.params.merge_with_patch(patch)

    assert doc_serializer.serialize().text is not None
64+
65+
66+
def test_doctags_exclude_otsl(sample_doc):
    """Serialize ``sample_doc`` with ``include_otsl`` off and layout on.

    Only asserts that the serialization result carries a non-``None`` text.
    """
    doc_serializer = DocTagsDocSerializer(doc=sample_doc)
    patch = DocTagsParams(include_otsl=False, include_layout=True).model_dump()
    doc_serializer.params = doc_serializer.params.merge_with_patch(patch)

    assert doc_serializer.serialize().text is not None
74+
75+
76+
def test_doctags_exclude_picture(sample_doc):
    """Serialize ``sample_doc`` with ``include_picture`` turned off.

    Only asserts that the serialization result carries a non-``None`` text.
    """
    doc_serializer = DocTagsDocSerializer(doc=sample_doc)
    patch = DocTagsParams(include_picture=False).model_dump()
    doc_serializer.params = doc_serializer.params.merge_with_patch(patch)

    assert doc_serializer.serialize().text is not None
84+
85+
86+
def test_doctags_exclude_chart(sample_doc):
    """Serialize ``sample_doc`` with ``include_chart`` turned off.

    Only asserts that the serialization result carries a non-``None`` text.
    """
    doc_serializer = DocTagsDocSerializer(doc=sample_doc)
    patch = DocTagsParams(include_chart=False).model_dump()
    doc_serializer.params = doc_serializer.params.merge_with_patch(patch)

    assert doc_serializer.serialize().text is not None
94+
95+
96+
def test_doctags_exclude_code(sample_doc):
    """Serialize ``sample_doc`` with ``include_code`` turned off.

    Only asserts that the serialization result carries a non-``None`` text.
    """
    doc_serializer = DocTagsDocSerializer(doc=sample_doc)
    patch = DocTagsParams(include_code=False).model_dump()
    doc_serializer.params = doc_serializer.params.merge_with_patch(patch)

    assert doc_serializer.serialize().text is not None
104+
105+
106+
def test_doctags_exclude_formula(sample_doc):
    """Serialize ``sample_doc`` with ``include_formula`` turned off.

    Only asserts that the serialization result carries a non-``None`` text.
    """
    doc_serializer = DocTagsDocSerializer(doc=sample_doc)
    patch = DocTagsParams(include_formula=False).model_dump()
    doc_serializer.params = doc_serializer.params.merge_with_patch(patch)

    assert doc_serializer.serialize().text is not None
114+
115+
116+
def test_doctags_no_layout_no_locations(sample_doc):
    """Check that no ``<loc`` token is emitted when layout is disabled.

    ``add_location=True`` is requested together with
    ``include_layout=False``; the serialized text must not contain the
    substring ``"<loc"``.
    """
    doc_serializer = DocTagsDocSerializer(doc=sample_doc)
    patch = DocTagsParams(include_layout=False, add_location=True).model_dump()
    doc_serializer.params = doc_serializer.params.merge_with_patch(patch)

    serialized_text = doc_serializer.serialize().text
    # An empty text trivially satisfies this check, so no separate
    # empty-string case is needed.
    assert "<loc" not in serialized_text
124+
125+
126+
def test_doctags_layout_with_locations(sample_doc):
    """Serialize ``sample_doc`` with layout and locations both enabled.

    Only asserts that the serialization result carries a non-``None`` text.
    """
    doc_serializer = DocTagsDocSerializer(doc=sample_doc)
    patch = DocTagsParams(include_layout=True, add_location=True).model_dump()
    doc_serializer.params = doc_serializer.params.merge_with_patch(patch)

    assert doc_serializer.serialize().text is not None
134+
135+
136+
def test_doctags_table_location_without_otsl(sample_doc):
    """Serialize with OTSL off while layout and locations stay on.

    Only asserts that the serialization result carries a non-``None`` text.
    """
    doc_serializer = DocTagsDocSerializer(doc=sample_doc)
    requested = DocTagsParams(
        include_otsl=False,
        include_layout=True,
        add_location=True,
    )
    doc_serializer.params = doc_serializer.params.merge_with_patch(
        requested.model_dump()
    )

    assert doc_serializer.serialize().text is not None
146+
147+
148+
def test_doctags_table_caption_without_otsl(sample_doc):
    """Serialize with OTSL off while layout and captions stay on.

    Only asserts that the serialization result carries a non-``None`` text.
    """
    doc_serializer = DocTagsDocSerializer(doc=sample_doc)
    requested = DocTagsParams(
        include_otsl=False,
        include_layout=True,
        add_caption=True,
    )
    doc_serializer.params = doc_serializer.params.merge_with_patch(
        requested.model_dump()
    )

    assert doc_serializer.serialize().text is not None
158+
159+
160+
def test_doctags_multiple_filters(sample_doc):
    """Serialize with OCR kept and five other content filters switched off.

    OTSL, picture, chart, code and formula are all disabled at once.
    Only asserts that the serialization result carries a non-``None`` text.
    """
    doc_serializer = DocTagsDocSerializer(doc=sample_doc)
    requested = DocTagsParams(
        include_ocr=True,
        include_otsl=False,
        include_picture=False,
        include_chart=False,
        include_code=False,
        include_formula=False,
    )
    doc_serializer.params = doc_serializer.params.merge_with_patch(
        requested.model_dump()
    )

    assert doc_serializer.serialize().text is not None
175+
176+
177+
def test_doctags_layout_mode_only(sample_doc):
    """Serialize with ``layout_mode_only`` on and content output off.

    Layout and locations are enabled while ``add_content`` is disabled.
    Only asserts that the serialization result carries a non-``None`` text.
    """
    doc_serializer = DocTagsDocSerializer(doc=sample_doc)
    requested = DocTagsParams(
        include_layout=True,
        layout_mode_only=True,
        add_location=True,
        add_content=False,
    )
    doc_serializer.params = doc_serializer.params.merge_with_patch(
        requested.model_dump()
    )

    assert doc_serializer.serialize().text is not None
190+
191+
192+
def test_doctags_params_mode_minified(sample_doc):
    """Serialize in minified mode and confirm the mode stuck on the params.

    Asserts that the result text is not ``None`` and that the serializer's
    params report ``DocTagsParams.Mode.MINIFIED`` after the patch.
    """
    minified = DocTagsParams.Mode.MINIFIED
    doc_serializer = DocTagsDocSerializer(doc=sample_doc)
    patch = DocTagsParams(mode=minified).model_dump()
    doc_serializer.params = doc_serializer.params.merge_with_patch(patch)

    output = doc_serializer.serialize()

    assert output.text is not None
    assert doc_serializer.params.mode == minified
201+
202+
203+
def test_doctags_params_mode_human_friendly(sample_doc):
    """Serialize ``sample_doc`` in human-friendly mode.

    Only asserts that the serialization result carries a non-``None`` text.
    """
    doc_serializer = DocTagsDocSerializer(doc=sample_doc)
    patch = DocTagsParams(mode=DocTagsParams.Mode.HUMAN_FRIENDLY).model_dump()
    doc_serializer.params = doc_serializer.params.merge_with_patch(patch)

    assert doc_serializer.serialize().text is not None
211+
212+
213+
def test_doctags_picture_serializer_helper_methods(sample_doc):
    """Check that the picture serializer exposes its expected helper attributes.

    This is a presence check only: the helpers are looked up by name on
    ``serializer.picture_serializer`` and are never invoked.
    """
    pic_serializer = DocTagsDocSerializer(doc=sample_doc).picture_serializer
    expected_helpers = (
        "_get_predicted_class",
        "_is_chart_type",
        "_get_molecule_smi",
        "_get_tabular_chart_data",
        "_build_body_content",
    )
    for helper_name in expected_helpers:
        assert hasattr(pic_serializer, helper_name)
223+
224+
225+
def test_doctags_should_process_item_logic(sample_doc):
    """Serialize twice on one serializer: layout on, then layout on with OTSL off.

    The second patch is merged on top of the params left by the first
    round. Each round only asserts that the result text is not ``None``.
    """
    doc_serializer = DocTagsDocSerializer(doc=sample_doc)
    patch_rounds = (
        {"include_layout": True},
        {"include_layout": True, "include_otsl": False},
    )
    for overrides in patch_rounds:
        patch = DocTagsParams(**overrides).model_dump()
        # Reassigning params each round keeps the patches cumulative.
        doc_serializer.params = doc_serializer.params.merge_with_patch(patch)
        assert doc_serializer.serialize().text is not None

0 commit comments

Comments
 (0)