Skip to content

Commit

Permalink
Merge branch 'cau/legacy-add-payload-field' of github.com:DS4SD/docling-core into feat-add-legacy-convert
Browse files Browse the repository at this point in the history
  • Loading branch information
cau-git committed Dec 3, 2024
2 parents f1001d8 + 1883b5f commit 4434b10
Show file tree
Hide file tree
Showing 5 changed files with 19 additions and 23 deletions.
31 changes: 16 additions & 15 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -1576,7 +1576,7 @@ def iterate_items(
self,
root: Optional[NodeItem] = None,
with_groups: bool = False,
traverse_pictures: bool = True,
traverse_pictures: bool = False,
page_no: Optional[int] = None,
_level: int = 0, # fixed parameter, carries through the node nesting level
) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
Expand All @@ -1593,30 +1593,31 @@ def iterate_items(
if not root:
root = self.body

# Yield non-group items or group items when with_groups=True
if not isinstance(root, GroupItem) or with_groups:
if isinstance(root, DocItem):
if page_no is not None:
for prov in root.prov:
if prov.page_no == page_no:
yield root, _level
else:
if page_no is None or any(
prov.page_no == page_no for prov in root.prov
):
yield root, _level
else:
yield root, _level

# Handle picture traversal - only traverse children if requested
if isinstance(root, PictureItem) and not traverse_pictures:
return

# Traverse children
for child_ref in root.children:
child = child_ref.resolve(self)

if isinstance(child, NodeItem):
# If the child is a NodeItem, recursively traverse it
if not isinstance(child, PictureItem) or traverse_pictures:
yield from self.iterate_items(
child,
_level=_level + 1,
with_groups=with_groups,
page_no=page_no,
)
yield from self.iterate_items(
child,
with_groups=with_groups,
traverse_pictures=traverse_pictures,
page_no=page_no,
_level=_level + 1,
)

def _clear_picture_pil_cache(self):
"""Clear cache storage of all images."""
Expand Down
1 change: 0 additions & 1 deletion test/data/doc/dummy_doc.yaml.dt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
<location><page_1><loc_59><loc_0><loc_91><loc_75></location>
<caption>Figure 1: Four examples of complex page layouts across different document categories</caption>
</figure>
<section_header><location><page_1><loc_42><loc_57><loc_49><loc_61></location>OPERATION (cont.)</section_header>
<table>
<location><page_1><loc_42><loc_57><loc_49><loc_61></location>
</table>
Expand Down
5 changes: 2 additions & 3 deletions test/data/doc/dummy_doc.yaml.et
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
0: unspecified with name=_root_
1: title
2: picture
3: section_header
4: caption
5: table
3: caption
4: table
1 change: 0 additions & 1 deletion test/data/doc/dummy_doc.yaml.html
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,5 @@
</head>
<h1>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</h1>
<figure><figcaption>Figure 1: Four examples of complex page layouts across different document categories</figcaption></figure>
<h2>OPERATION (cont.)</h2>

</html>
4 changes: 1 addition & 3 deletions test/data/doc/dummy_doc.yaml.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,4 @@

Figure 1: Four examples of complex page layouts across different document categories

<!-- image -->

## OPERATION (cont.)
<!-- image -->

0 comments on commit 4434b10

Please sign in to comment.