Skip to content

Commit a2b12ec

Browse files
committed
refactor(DoclingDocument): create a new provenance model for media file types
Signed-off-by: Cesar Berrospi Ramis <[email protected]>
1 parent 6ef9533 commit a2b12ec

File tree

11 files changed

+456
-106
lines changed

11 files changed

+456
-106
lines changed

docling_core/transforms/serializer/azure.py

Lines changed: 13 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -44,22 +44,23 @@
4444
DocSerializer,
4545
create_ser_result,
4646
)
47-
from docling_core.types.doc.base import CoordOrigin
48-
from docling_core.types.doc.document import (
47+
from docling_core.types.doc import (
48+
CoordOrigin,
4949
DocItem,
50+
DocItemLabel,
5051
DoclingDocument,
5152
FormItem,
5253
InlineGroup,
5354
KeyValueItem,
5455
ListGroup,
5556
NodeItem,
5657
PictureItem,
58+
ProvenanceItem,
5759
RefItem,
5860
RichTableCell,
5961
TableItem,
6062
TextItem,
6163
)
62-
from docling_core.types.doc.labels import DocItemLabel
6364

6465

6566
def _bbox_to_polygon_coords(
@@ -78,7 +79,7 @@ def _bbox_to_polygon_for_item(
7879
doc: DoclingDocument, item: DocItem
7980
) -> Optional[list[float]]:
8081
"""Compute a TOPLEFT-origin polygon for the first provenance of the item."""
81-
if not item.prov:
82+
if not item.prov or not isinstance(item.prov[0], ProvenanceItem):
8283
return None
8384

8485
prov = item.prov[0]
@@ -189,7 +190,7 @@ def serialize(
189190

190191
# Lists may be represented either as TextItem(ListItem) or via groups;
191192
# we treat any TextItem as a paragraph-like entry.
192-
if item.prov:
193+
if item.prov and isinstance(item.prov[0], ProvenanceItem):
193194
prov = item.prov[0]
194195
page_no = prov.page_no
195196
polygon = _bbox_to_polygon_for_item(doc, item)
@@ -241,7 +242,7 @@ def serialize(
241242
) -> SerializationResult:
242243
assert isinstance(doc_serializer, AzureDocSerializer)
243244

244-
if not item.prov:
245+
if not item.prov or not isinstance(item.prov[0], ProvenanceItem):
245246
return create_ser_result()
246247

247248
prov = item.prov[0]
@@ -322,7 +323,7 @@ def serialize(
322323
) -> SerializationResult:
323324
assert isinstance(doc_serializer, AzureDocSerializer)
324325

325-
if not item.prov:
326+
if not item.prov or not isinstance(item.prov[0], ProvenanceItem):
326327
return create_ser_result()
327328

328329
prov = item.prov[0]
@@ -340,7 +341,11 @@ def serialize(
340341
for foot_ref in item.footnotes:
341342
if isinstance(foot_ref, RefItem):
342343
tgt = foot_ref.resolve(doc)
343-
if isinstance(tgt, TextItem) and tgt.prov:
344+
if (
345+
isinstance(tgt, TextItem)
346+
and tgt.prov
347+
and isinstance(tgt.prov[0], ProvenanceItem)
348+
):
344349
f_poly = _bbox_to_polygon_for_item(doc, tgt)
345350
if f_poly is not None:
346351
foots.append(

docling_core/transforms/serializer/common.py

Lines changed: 19 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -34,11 +34,11 @@
3434
SerializationResult,
3535
Span,
3636
)
37-
from docling_core.types.doc.document import (
38-
DOCUMENT_TOKENS_EXPORT_LABELS,
37+
from docling_core.types.doc import (
3938
ContentLayer,
4039
DescriptionAnnotation,
4140
DocItem,
41+
DocItemLabel,
4242
DoclingDocument,
4343
FloatingItem,
4444
Formatting,
@@ -51,12 +51,13 @@
5151
PictureDataType,
5252
PictureItem,
5353
PictureMoleculeData,
54+
ProvenanceItem,
5455
Script,
5556
TableAnnotationType,
5657
TableItem,
5758
TextItem,
5859
)
59-
from docling_core.types.doc.labels import DocItemLabel
60+
from docling_core.types.doc.document import DOCUMENT_TOKENS_EXPORT_LABELS
6061

6162
_DEFAULT_LABELS = DOCUMENT_TOKENS_EXPORT_LABELS
6263
_DEFAULT_LAYERS = {cl for cl in ContentLayer}
@@ -110,7 +111,11 @@ def _iterate_items(
110111
add_page_breaks=add_page_breaks,
111112
visited=my_visited,
112113
):
113-
if isinstance(it, DocItem) and it.prov:
114+
if (
115+
isinstance(it, DocItem)
116+
and it.prov
117+
and isinstance(it.prov[0], ProvenanceItem)
118+
):
114119
page_no = it.prov[0].page_no
115120
if prev_page_nr is not None and page_no > prev_page_nr:
116121
yield _PageBreakNode(
@@ -119,7 +124,11 @@ def _iterate_items(
119124
next_page=page_no,
120125
), lvl
121126
break
122-
elif isinstance(item, DocItem) and item.prov:
127+
elif (
128+
isinstance(item, DocItem)
129+
and item.prov
130+
and isinstance(item.prov[0], ProvenanceItem)
131+
):
123132
page_no = item.prov[0].page_no
124133
if prev_page_nr is None or page_no > prev_page_nr:
125134
if prev_page_nr is not None: # close previous range
@@ -288,7 +297,10 @@ def get_excluded_refs(self, **kwargs: Any) -> set[str]:
288297
params.pages is not None
289298
and (
290299
(not item.prov)
291-
or item.prov[0].page_no not in params.pages
300+
or (
301+
isinstance(item.prov[0], ProvenanceItem)
302+
and item.prov[0].page_no not in params.pages
303+
)
292304
)
293305
)
294306
)
@@ -635,6 +647,7 @@ def _get_applicable_pages(self) -> Optional[list[int]]:
635647
if (
636648
isinstance(item, DocItem)
637649
and item.prov
650+
and isinstance(item.prov[0], ProvenanceItem)
638651
and (
639652
self.params.pages is None
640653
or item.prov[0].page_no in self.params.pages

docling_core/transforms/serializer/doctags.py

Lines changed: 8 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -26,11 +26,13 @@
2626
_should_use_legacy_annotations,
2727
create_ser_result,
2828
)
29-
from docling_core.types.doc.base import BoundingBox
3029
from docling_core.types.doc.document import (
30+
BoundingBox,
3131
CodeItem,
3232
DocItem,
33+
DocItemLabel,
3334
DoclingDocument,
35+
DocumentToken,
3436
FloatingItem,
3537
FormItem,
3638
GroupItem,
@@ -40,17 +42,17 @@
4042
ListItem,
4143
NodeItem,
4244
PictureClassificationData,
45+
PictureClassificationLabel,
4346
PictureItem,
4447
PictureMoleculeData,
4548
PictureTabularChartData,
4649
ProvenanceItem,
4750
SectionHeaderItem,
4851
TableData,
4952
TableItem,
53+
TableToken,
5054
TextItem,
5155
)
52-
from docling_core.types.doc.labels import DocItemLabel, PictureClassificationLabel
53-
from docling_core.types.doc.tokens import DocumentToken, TableToken
5456

5557

5658
def _wrap(text: str, wrap_tag: str) -> str:
@@ -360,7 +362,7 @@ def serialize(
360362
results: list[SerializationResult] = []
361363

362364
page_no = 1
363-
if len(item.prov) > 0:
365+
if len(item.prov) > 0 and isinstance(item.prov[0], ProvenanceItem):
364366
page_no = item.prov[0].page_no
365367

366368
if params.add_location:
@@ -380,7 +382,7 @@ def serialize(
380382

381383
for cell in item.graph.cells:
382384
cell_txt = ""
383-
if cell.prov is not None:
385+
if cell.prov is not None and isinstance(cell.prov, ProvenanceItem):
384386
if len(doc.pages.keys()):
385387
page_w, page_h = doc.pages[page_no].size.as_tuple()
386388
cell_txt += DocumentToken.get_location(
@@ -492,7 +494,7 @@ def _get_inline_location_tags(
492494
doc_items: list[DocItem] = []
493495
for it, _ in doc.iterate_items(root=item):
494496
if isinstance(it, DocItem):
495-
for prov in it.prov:
497+
for prov in (im for im in it.prov if isinstance(im, ProvenanceItem)):
496498
boxes.append(prov.bbox)
497499
doc_items.append(it)
498500
if prov is None:

docling_core/transforms/visualizer/key_value_visualizer.py

Lines changed: 14 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -16,8 +16,13 @@
1616
from typing_extensions import override
1717

1818
from docling_core.transforms.visualizer.base import BaseVisualizer
19-
from docling_core.types.doc.document import ContentLayer, DoclingDocument
20-
from docling_core.types.doc.labels import GraphCellLabel, GraphLinkLabel
19+
from docling_core.types.doc import (
20+
ContentLayer,
21+
DoclingDocument,
22+
GraphCellLabel,
23+
GraphLinkLabel,
24+
ProvenanceItem,
25+
)
2126

2227
# ---------------------------------------------------------------------------
2328
# Helper functions / constants
@@ -78,7 +83,11 @@ def _draw_key_value_layer(
7883
# First draw cells (rectangles + optional labels)
7984
# ------------------------------------------------------------------
8085
for cell in cell_dict.values():
81-
if cell.prov is None or cell.prov.page_no != page_no:
86+
if (
87+
cell.prov is None
88+
or not isinstance(cell.prov, ProvenanceItem)
89+
or cell.prov.page_no != page_no
90+
):
8291
continue # skip cells not on this page or without bbox
8392

8493
tl_bbox = cell.prov.bbox.to_top_left_origin(
@@ -127,6 +136,8 @@ def _draw_key_value_layer(
127136
if (
128137
src_cell.prov is None
129138
or tgt_cell.prov is None
139+
or not isinstance(src_cell.prov, ProvenanceItem)
140+
or not isinstance(tgt_cell.prov, ProvenanceItem)
130141
or src_cell.prov.page_no != page_no
131142
or tgt_cell.prov.page_no != page_no
132143
):

docling_core/transforms/visualizer/layout_visualizer.py

Lines changed: 13 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -10,10 +10,16 @@
1010
from typing_extensions import override
1111

1212
from docling_core.transforms.visualizer.base import BaseVisualizer
13-
from docling_core.types.doc import DocItemLabel
14-
from docling_core.types.doc.base import CoordOrigin
15-
from docling_core.types.doc.document import ContentLayer, DocItem, DoclingDocument
16-
from docling_core.types.doc.page import BoundingRectangle, TextCell
13+
from docling_core.types.doc import (
14+
BoundingRectangle,
15+
ContentLayer,
16+
CoordOrigin,
17+
DocItem,
18+
DocItemLabel,
19+
DoclingDocument,
20+
ProvenanceItem,
21+
TextCell,
22+
)
1723

1824

1925
class _TLBoundingRectangle(BoundingRectangle):
@@ -157,7 +163,9 @@ def _draw_doc_layout(
157163
if len(elem.prov) == 0:
158164
continue # Skip elements without provenances
159165

160-
for prov in elem.prov:
166+
for prov in (
167+
item for item in elem.prov if isinstance(item, ProvenanceItem)
168+
):
161169
page_nr = prov.page_no
162170

163171
if page_nr in my_images:

docling_core/transforms/visualizer/reading_order_visualizer.py

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,12 @@
1010
from typing_extensions import override
1111

1212
from docling_core.transforms.visualizer.base import BaseVisualizer
13-
from docling_core.types.doc.document import ContentLayer, DocItem, DoclingDocument
13+
from docling_core.types.doc.document import (
14+
ContentLayer,
15+
DocItem,
16+
DoclingDocument,
17+
ProvenanceItem,
18+
)
1419

1520

1621
class _NumberDrawingData(BaseModel):
@@ -102,7 +107,9 @@ def _draw_doc_reading_order(
102107
if len(elem.prov) == 0:
103108
continue # Skip elements without provenances
104109

105-
for prov in elem.prov:
110+
for prov in (
111+
item for item in elem.prov if isinstance(item, ProvenanceItem)
112+
):
106113
page_no = prov.page_no
107114
image = my_images.get(page_no)
108115

docling_core/transforms/visualizer/table_visualizer.py

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,12 @@
1010
from typing_extensions import override
1111

1212
from docling_core.transforms.visualizer.base import BaseVisualizer
13-
from docling_core.types.doc.document import ContentLayer, DoclingDocument, TableItem
13+
from docling_core.types.doc import (
14+
ContentLayer,
15+
DoclingDocument,
16+
ProvenanceItem,
17+
TableItem,
18+
)
1419

1520
_log = logging.getLogger(__name__)
1621

@@ -171,12 +176,12 @@ def _draw_doc_tables(
171176
image = deepcopy(pil_img)
172177
my_images[page_nr] = image
173178

174-
for idx, (elem, _) in enumerate(
179+
for _, (elem, _) in enumerate(
175180
doc.iterate_items(included_content_layers=included_content_layers)
176181
):
177182
if not isinstance(elem, TableItem):
178183
continue
179-
if len(elem.prov) == 0:
184+
if len(elem.prov) == 0 or not isinstance(elem.prov[0], ProvenanceItem):
180185
continue # Skip elements without provenances
181186

182187
if len(elem.prov) == 1:

docling_core/types/doc/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -61,6 +61,7 @@
6161
Script,
6262
SectionHeaderItem,
6363
SummaryMetaField,
64+
TableAnnotationType,
6465
TableCell,
6566
TableData,
6667
TableItem,

0 commit comments

Comments
 (0)