Skip to content

Commit c9af4d6

Browse files
committed
Merge from main, update tests
Signed-off-by: Christoph Auer <[email protected]>
2 parents 7cc569a + 7094828 commit c9af4d6

File tree

64 files changed

+92973
-648
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

64 files changed

+92973
-648
lines changed

CHANGELOG.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,34 @@
1+
## [v2.34.2](https://github.com/docling-project/docling-core/releases/tag/v2.34.2) - 2025-06-10
2+
3+
### Fix
4+
5+
* Fix doc traversal for item deletion ([#324](https://github.com/docling-project/docling-core/issues/324)) ([`076ad2b`](https://github.com/docling-project/docling-core/commit/076ad2ba9bd551daf1ee6a0141d360a7be8a18bd))
6+
7+
## [v2.34.1](https://github.com/docling-project/docling-core/releases/tag/v2.34.1) - 2025-06-08
8+
9+
### Fix
10+
11+
* Warn when adding misplaced ListItem via API ([#321](https://github.com/docling-project/docling-core/issues/321)) ([`01b27b5`](https://github.com/docling-project/docling-core/commit/01b27b57b32c8a1a23e65ce3bf60cd1c027ca915))
12+
13+
## [v2.34.0](https://github.com/docling-project/docling-core/releases/tag/v2.34.0) - 2025-06-06
14+
15+
### Feature
16+
17+
* **doctags:** Add enclosing bbox to inline ([#302](https://github.com/docling-project/docling-core/issues/302)) ([`dcc198f`](https://github.com/docling-project/docling-core/commit/dcc198f7c6231fe8c781abe4a83194be2ee8d23b))
18+
* Add subscript & superscript formatting ([#319](https://github.com/docling-project/docling-core/issues/319)) ([`ae96129`](https://github.com/docling-project/docling-core/commit/ae961299a5f729acecf1b2b346113ac23e8b97f0))
19+
* Add table annotations ([#304](https://github.com/docling-project/docling-core/issues/304)) ([`d8a5256`](https://github.com/docling-project/docling-core/commit/d8a5256b2cb654ceb35a70be1b656ac7463ad335))
20+
21+
### Fix
22+
23+
* Fix misplaced list items ([#317](https://github.com/docling-project/docling-core/issues/317)) ([`c383f64`](https://github.com/docling-project/docling-core/commit/c383f64c44b4e1eb760d19d9422948fea127331c))
24+
25+
## [v2.33.1](https://github.com/docling-project/docling-core/releases/tag/v2.33.1) - 2025-06-04
26+
27+
### Fix
28+
29+
* New typer version with new click ([#315](https://github.com/docling-project/docling-core/issues/315)) ([`e17eabf`](https://github.com/docling-project/docling-core/commit/e17eabf0f92c3e3fde9f47fea083e44081bb9669))
30+
* Support section_header levels in doctags deserialization ([#313](https://github.com/docling-project/docling-core/issues/313)) ([`defd49e`](https://github.com/docling-project/docling-core/commit/defd49efae08c0cc2ce002f847724b6d65bd1407))
31+
132
## [v2.33.0](https://github.com/docling-project/docling-core/releases/tag/v2.33.0) - 2025-06-02
233

334
### Feature

docling_core/transforms/serializer/base.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,16 @@ def serialize_strikethrough(self, text: str, **kwargs: Any) -> str:
202202
"""Hook for strikethrough formatting serialization."""
203203
...
204204

205+
@abstractmethod
206+
def serialize_subscript(self, text: str, **kwargs: Any) -> str:
207+
"""Hook for subscript formatting serialization."""
208+
...
209+
210+
@abstractmethod
211+
def serialize_superscript(self, text: str, **kwargs: Any) -> str:
212+
"""Hook for superscript formatting serialization."""
213+
...
214+
205215
@abstractmethod
206216
def serialize_hyperlink(
207217
self,
@@ -239,6 +249,15 @@ def serialize_captions(
239249
"""Serialize the item's captions."""
240250
...
241251

252+
@abstractmethod
253+
def serialize_annotations(
254+
self,
255+
item: DocItem,
256+
**kwargs: Any,
257+
) -> SerializationResult:
258+
"""Serialize the item's annotations."""
259+
...
260+
242261
@abstractmethod
243262
def get_excluded_refs(self, **kwargs: Any) -> set[str]:
244263
"""Get references to excluded items."""
@@ -257,3 +276,18 @@ class BaseSerializerProvider(ABC):
257276
def get_serializer(self, doc: DoclingDocument) -> BaseDocSerializer:
258277
"""Get the associated serializer."""
259278
...
279+
280+
281+
class BaseAnnotationSerializer(ABC):
282+
"""Base class for annotation serializers."""
283+
284+
@abstractmethod
285+
def serialize(
286+
self,
287+
*,
288+
item: DocItem,
289+
doc: DoclingDocument,
290+
**kwargs: Any,
291+
) -> SerializationResult:
292+
"""Serializes the passed annotation."""
293+
...

docling_core/transforms/serializer/common.py

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from typing_extensions import Self, override
1616

1717
from docling_core.transforms.serializer.base import (
18+
BaseAnnotationSerializer,
1819
BaseDocSerializer,
1920
BaseFallbackSerializer,
2021
BaseFormSerializer,
@@ -30,6 +31,7 @@
3031
from docling_core.types.doc.document import (
3132
DOCUMENT_TOKENS_EXPORT_LABELS,
3233
ContentLayer,
34+
DescriptionAnnotation,
3335
DocItem,
3436
DoclingDocument,
3537
FloatingItem,
@@ -41,9 +43,10 @@
4143
OrderedList,
4244
PictureClassificationData,
4345
PictureDataType,
44-
PictureDescriptionData,
4546
PictureItem,
4647
PictureMoleculeData,
48+
Script,
49+
TableAnnotationType,
4750
TableItem,
4851
TextItem,
4952
UnorderedList,
@@ -122,7 +125,9 @@ def _iterate_items(
122125
yield item
123126

124127

125-
def _get_picture_annotation_text(annotation: PictureDataType) -> Optional[str]:
128+
def _get_annotation_text(
129+
annotation: Union[PictureDataType, TableAnnotationType],
130+
) -> Optional[str]:
126131
result = None
127132
if isinstance(annotation, PictureClassificationData):
128133
predicted_class = (
@@ -132,7 +137,7 @@ def _get_picture_annotation_text(annotation: PictureDataType) -> Optional[str]:
132137
)
133138
if predicted_class is not None:
134139
result = predicted_class.replace("_", " ")
135-
elif isinstance(annotation, PictureDescriptionData):
140+
elif isinstance(annotation, DescriptionAnnotation):
136141
result = annotation.text
137142
elif isinstance(annotation, PictureMoleculeData):
138143
result = annotation.smi
@@ -211,6 +216,8 @@ class DocSerializer(BaseModel, BaseDocSerializer):
211216
list_serializer: BaseListSerializer
212217
inline_serializer: BaseInlineSerializer
213218

219+
annotation_serializer: BaseAnnotationSerializer
220+
214221
params: CommonParams = CommonParams()
215222

216223
_excluded_refs_cache: dict[str, set[str]] = {}
@@ -449,6 +456,10 @@ def post_process(
449456
res = self.serialize_underline(text=res)
450457
if formatting.strikethrough:
451458
res = self.serialize_strikethrough(text=res)
459+
if formatting.script == Script.SUB:
460+
res = self.serialize_subscript(text=res)
461+
elif formatting.script == Script.SUPER:
462+
res = self.serialize_superscript(text=res)
452463
if params.include_hyperlinks and hyperlink:
453464
res = self.serialize_hyperlink(text=res, hyperlink=hyperlink)
454465
return res
@@ -473,6 +484,16 @@ def serialize_strikethrough(self, text: str, **kwargs: Any) -> str:
473484
"""Hook for strikethrough formatting serialization."""
474485
return text
475486

487+
@override
488+
def serialize_subscript(self, text: str, **kwargs: Any) -> str:
489+
"""Hook for subscript formatting serialization."""
490+
return text
491+
492+
@override
493+
def serialize_superscript(self, text: str, **kwargs: Any) -> str:
494+
"""Hook for superscript formatting serialization."""
495+
return text
496+
476497
@override
477498
def serialize_hyperlink(
478499
self,
@@ -505,6 +526,19 @@ def serialize_captions(
505526
text_res = ""
506527
return create_ser_result(text=text_res, span_source=results)
507528

529+
@override
530+
def serialize_annotations(
531+
self,
532+
item: DocItem,
533+
**kwargs: Any,
534+
) -> SerializationResult:
535+
"""Serialize the item's annotations."""
536+
return self.annotation_serializer.serialize(
537+
item=item,
538+
doc=self.doc,
539+
**kwargs,
540+
)
541+
508542
def _get_applicable_pages(self) -> Optional[list[int]]:
509543
pages = {
510544
item.prov[0].page_no: ...

docling_core/transforms/serializer/doctags.py

Lines changed: 65 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from typing_extensions import override
88

99
from docling_core.transforms.serializer.base import (
10+
BaseAnnotationSerializer,
1011
BaseDocSerializer,
1112
BaseFallbackSerializer,
1213
BaseFormSerializer,
@@ -17,12 +18,14 @@
1718
BaseTableSerializer,
1819
BaseTextSerializer,
1920
SerializationResult,
21+
Span,
2022
)
2123
from docling_core.transforms.serializer.common import (
2224
CommonParams,
2325
DocSerializer,
2426
create_ser_result,
2527
)
28+
from docling_core.types.doc.base import BoundingBox
2629
from docling_core.types.doc.document import (
2730
CodeItem,
2831
DocItem,
@@ -38,6 +41,7 @@
3841
PictureItem,
3942
PictureMoleculeData,
4043
PictureTabularChartData,
44+
ProvenanceItem,
4145
TableItem,
4246
TextItem,
4347
UnorderedList,
@@ -414,6 +418,39 @@ def serialize(
414418
class DocTagsInlineSerializer(BaseInlineSerializer):
415419
"""DocTags-specific inline group serializer."""
416420

421+
def _get_inline_location_tags(
422+
self, doc: DoclingDocument, item: InlineGroup, params: DocTagsParams
423+
) -> SerializationResult:
424+
425+
prov: Optional[ProvenanceItem] = None
426+
boxes: list[BoundingBox] = []
427+
doc_items: list[DocItem] = []
428+
for it, _ in doc.iterate_items(root=item):
429+
if isinstance(it, DocItem):
430+
for prov in it.prov:
431+
boxes.append(prov.bbox)
432+
doc_items.append(it)
433+
if prov is None:
434+
return create_ser_result()
435+
436+
bbox = BoundingBox.enclosing_bbox(boxes=boxes)
437+
438+
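# page dims are taken from the last provenance seen, while the bbox encloses all of them; NOTE(review): assumes every provenance of the inline group is on the same page — confirm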
439+
page_w, page_h = doc.pages[prov.page_no].size.as_tuple()
440+
441+
loc_str = DocumentToken.get_location(
442+
bbox=bbox.to_top_left_origin(page_h).as_tuple(),
443+
page_w=page_w,
444+
page_h=page_h,
445+
xsize=params.xsize,
446+
ysize=params.ysize,
447+
)
448+
449+
return SerializationResult(
450+
text=loc_str,
451+
spans=[Span(item=it) for it in doc_items],
452+
)
453+
417454
@override
418455
def serialize(
419456
self,
@@ -428,12 +465,23 @@ def serialize(
428465
"""Serializes the passed item."""
429466
my_visited = visited if visited is not None else set()
430467
params = DocTagsParams(**kwargs)
431-
parts = doc_serializer.get_parts(
432-
item=item,
433-
list_level=list_level,
434-
is_inline_scope=True,
435-
visited=my_visited,
436-
**kwargs,
468+
parts: List[SerializationResult] = []
469+
if params.add_location:
470+
inline_loc_tags_ser_res = self._get_inline_location_tags(
471+
doc=doc,
472+
item=item,
473+
params=params,
474+
)
475+
parts.append(inline_loc_tags_ser_res)
476+
params.add_location = False # suppress children location serialization
477+
parts.extend(
478+
doc_serializer.get_parts(
479+
item=item,
480+
list_level=list_level,
481+
is_inline_scope=True,
482+
visited=my_visited,
483+
**{**kwargs, **params.model_dump()},
484+
)
437485
)
438486
wrap_tag = DocumentToken.INLINE.value
439487
delim = _get_delim(params=params)
@@ -460,6 +508,15 @@ def serialize(
460508
return create_ser_result()
461509

462510

511+
class DocTagsAnnotationSerializer(BaseAnnotationSerializer):
512+
"""DocTags-specific annotation serializer."""
513+
514+
@override
515+
def serialize(self, *, item: DocItem, **kwargs: Any) -> SerializationResult:
516+
"""Serializes the item's annotations."""
517+
return create_ser_result()
518+
519+
463520
class DocTagsDocSerializer(DocSerializer):
464521
"""DocTags-specific document serializer."""
465522

@@ -473,6 +530,8 @@ class DocTagsDocSerializer(DocSerializer):
473530
list_serializer: BaseListSerializer = DocTagsListSerializer()
474531
inline_serializer: BaseInlineSerializer = DocTagsInlineSerializer()
475532

533+
annotation_serializer: BaseAnnotationSerializer = DocTagsAnnotationSerializer()
534+
476535
params: DocTagsParams = DocTagsParams()
477536

478537
@override

0 commit comments

Comments
 (0)