Skip to content

Commit 4434b10

Browse files
committed
Merge branch 'cau/legacy-add-payload-field' of github.com:DS4SD/docling-core into feat-add-legacy-convert
2 parents f1001d8 + 1883b5f commit 4434b10

File tree

5 files changed

+19
-23
lines changed

5 files changed

+19
-23
lines changed

docling_core/types/doc/document.py

Lines changed: 16 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1576,7 +1576,7 @@ def iterate_items(
15761576
self,
15771577
root: Optional[NodeItem] = None,
15781578
with_groups: bool = False,
1579-
traverse_pictures: bool = True,
1579+
traverse_pictures: bool = False,
15801580
page_no: Optional[int] = None,
15811581
_level: int = 0, # fixed parameter, carries through the node nesting level
15821582
) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
@@ -1593,30 +1593,31 @@ def iterate_items(
15931593
if not root:
15941594
root = self.body
15951595

1596+
# Yield non-group items or group items when with_groups=True
15961597
if not isinstance(root, GroupItem) or with_groups:
15971598
if isinstance(root, DocItem):
1598-
if page_no is not None:
1599-
for prov in root.prov:
1600-
if prov.page_no == page_no:
1601-
yield root, _level
1602-
else:
1599+
if page_no is None or any(
1600+
prov.page_no == page_no for prov in root.prov
1601+
):
16031602
yield root, _level
16041603
else:
16051604
yield root, _level
16061605

1606+
# Handle picture traversal - only traverse children if requested
1607+
if isinstance(root, PictureItem) and not traverse_pictures:
1608+
return
1609+
16071610
# Traverse children
16081611
for child_ref in root.children:
16091612
child = child_ref.resolve(self)
1610-
16111613
if isinstance(child, NodeItem):
1612-
# If the child is a NodeItem, recursively traverse it
1613-
if not isinstance(child, PictureItem) or traverse_pictures:
1614-
yield from self.iterate_items(
1615-
child,
1616-
_level=_level + 1,
1617-
with_groups=with_groups,
1618-
page_no=page_no,
1619-
)
1614+
yield from self.iterate_items(
1615+
child,
1616+
with_groups=with_groups,
1617+
traverse_pictures=traverse_pictures,
1618+
page_no=page_no,
1619+
_level=_level + 1,
1620+
)
16201621

16211622
def _clear_picture_pil_cache(self):
16221623
"""Clear cache storage of all images."""

test/data/doc/dummy_doc.yaml.dt

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,6 @@
44
<location><page_1><loc_59><loc_0><loc_91><loc_75></location>
55
<caption>Figure 1: Four examples of complex page layouts across different document categories</caption>
66
</figure>
7-
<section_header><location><page_1><loc_42><loc_57><loc_49><loc_61></location>OPERATION (cont.)</section_header>
87
<table>
98
<location><page_1><loc_42><loc_57><loc_49><loc_61></location>
109
</table>

test/data/doc/dummy_doc.yaml.et

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
0: unspecified with name=_root_
22
1: title
33
2: picture
4-
3: section_header
5-
4: caption
6-
5: table
4+
3: caption
5+
4: table

test/data/doc/dummy_doc.yaml.html

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,5 @@
5757
</head>
5858
<h1>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</h1>
5959
<figure><figcaption>Figure 1: Four examples of complex page layouts across different document categories</figcaption></figure>
60-
<h2>OPERATION (cont.)</h2>
6160

6261
</html>

test/data/doc/dummy_doc.yaml.md

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,4 @@
22

33
Figure 1: Four examples of complex page layouts across different document categories
44

5-
<!-- image -->
6-
7-
## OPERATION (cont.)
5+
<!-- image -->

0 commit comments

Comments
 (0)