Skip to content

Commit 59e6bb5

Browse files
committed
feat: add ref number normalization
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 076ad2b commit 59e6bb5

File tree

4 files changed

+88
-22
lines changed

4 files changed

+88
-22
lines changed

docling_core/types/doc/document.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4318,3 +4318,68 @@ def validate_misplaced_list_items(self):
43184318
hyperlink=li.hyperlink,
43194319
)
43204320
return self
4321+
4322+
def normalize(self) -> None:
    """Normalize ref numbering by ordering node items as per iterate_items().

    The document tree is walked from the body with ``iterate_items()``
    (groups included, pictures traversed, all content layers). Each visited
    item is deep-copied and appended to a fresh per-collection list, so its
    new index equals its position in traversal order. ``self_ref``,
    ``parent`` and ``children`` are rewritten to the new numbering, and the
    document's item lists and body are replaced only after the whole walk
    succeeded. Items that are not yielded by the traversal are not carried
    over into the rebuilt lists.

    Raises:
        KeyError: If an item's collection is not one of the known item
            lists, or its parent was not visited before the item itself.
        RuntimeError: If a remapped parent ref is neither ``#/body`` nor of
            the form ``#/<collection>/<index>``.
    """
    # The body keeps its own attributes; its children are rebuilt below.
    new_body = copy.deepcopy(self.body)
    new_body.children = []

    item_lists: dict[str, list[NodeItem]] = {
        "groups": [],
        "texts": [],
        "pictures": [],
        "tables": [],
        "key_value_items": [],
        "form_items": [],
    }
    orig_ref_to_new_ref: dict[str, str] = {}

    # collect items in traversal order
    for item, _ in self.iterate_items(
        with_groups=True,
        traverse_pictures=True,
        included_content_layers=set(ContentLayer),
    ):
        key = item.self_ref.split("/")[1]
        if key == "body":
            # The root is never renumbered, but it must be in the mapping so
            # that its direct children can resolve their parent.
            orig_ref_to_new_ref[item.self_ref] = "#/body"
            continue

        # The next free index in the target list is the item's new number.
        new_cref = f"#/{key}/{len(item_lists[key])}"
        orig_ref_to_new_ref[item.self_ref] = new_cref

        new_item = copy.deepcopy(item)
        new_item.children = []  # re-populated as the item's children are visited
        new_item.self_ref = new_cref
        item_lists[key].append(new_item)

        if not item.parent:
            continue

        # Parents are looked up in the mapping built so far, i.e. this relies
        # on the traversal yielding a parent before any of its children.
        new_parent_cref = orig_ref_to_new_ref[item.parent.cref]
        new_item.parent = RefItem(cref=new_parent_cref)

        # add item to parent's children
        path_components = new_parent_cref.split("/")
        num_components = len(path_components)
        parent_node: NodeItem
        if num_components == 3:
            _, parent_key, parent_index_str = path_components
            parent_node = item_lists[parent_key][int(parent_index_str)]
        elif num_components == 2 and path_components[1] == "body":
            parent_node = new_body
        else:
            raise RuntimeError(f"Unsupported ref format: {new_parent_cref}")
        parent_node.children.append(RefItem(cref=new_cref))

    # Refs stored outside the parent/children tree would otherwise keep the
    # old numbering. This runs after the walk because a referenced item may be
    # visited later than the item referring to it.
    # NOTE(review): assumes these fields, where present, are lists of RefItem
    # (captions/references/footnotes of floating items) - confirm against the
    # item schema. Refs to items that were not visited are left untouched.
    for new_items in item_lists.values():
        for new_item in new_items:
            for field_name in ("captions", "references", "footnotes"):
                refs = getattr(new_item, field_name, None)
                if not isinstance(refs, list):
                    continue
                for idx, ref in enumerate(refs):
                    old_target = getattr(ref, "cref", None)
                    if old_target in orig_ref_to_new_ref:
                        refs[idx] = RefItem(cref=orig_ref_to_new_ref[old_target])

    # update document
    self.groups = item_lists["groups"]  # type: ignore
    self.texts = item_lists["texts"]  # type: ignore
    self.pictures = item_lists["pictures"]  # type: ignore
    self.tables = item_lists["tables"]  # type: ignore
    self.key_value_items = item_lists["key_value_items"]  # type: ignore
    self.form_items = item_lists["form_items"]  # type: ignore
    self.body = new_body

test/data/doc/dummy_doc.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ texts:
9898

9999

100100
tables: # All tables...
101-
- self_ref: "#/table/0"
101+
- self_ref: "#/tables/0"
102102
label: "table"
103103
parent:
104104
$ref: "#/body"
Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
body:
22
children:
3-
- $ref: '#/groups/1'
4-
- $ref: '#/texts/0'
53
- $ref: '#/groups/0'
4+
- $ref: '#/texts/1'
5+
- $ref: '#/groups/1'
66
content_layer: body
77
label: unspecified
88
name: _root_
@@ -16,18 +16,18 @@ furniture:
1616
self_ref: '#/furniture'
1717
groups:
1818
- children:
19-
- $ref: '#/texts/1'
20-
- $ref: '#/texts/2'
19+
- $ref: '#/texts/0'
2120
content_layer: body
22-
label: list
21+
label: ordered_list
2322
name: group
2423
parent:
2524
$ref: '#/body'
2625
self_ref: '#/groups/0'
2726
- children:
27+
- $ref: '#/texts/2'
2828
- $ref: '#/texts/3'
2929
content_layer: body
30-
label: ordered_list
30+
label: list
3131
name: group
3232
parent:
3333
$ref: '#/body'
@@ -39,14 +39,25 @@ pictures: []
3939
schema_name: DoclingDocument
4040
tables: []
4141
texts:
42+
- children: []
43+
content_layer: body
44+
enumerated: true
45+
label: list_item
46+
marker: '1.'
47+
orig: foo
48+
parent:
49+
$ref: '#/groups/0'
50+
prov: []
51+
self_ref: '#/texts/0'
52+
text: foo
4253
- children: []
4354
content_layer: body
4455
label: text
4556
orig: bar
4657
parent:
4758
$ref: '#/body'
4859
prov: []
49-
self_ref: '#/texts/0'
60+
self_ref: '#/texts/1'
5061
text: bar
5162
- children: []
5263
content_layer: body
@@ -55,30 +66,19 @@ texts:
5566
marker: '-'
5667
orig: here
5768
parent:
58-
$ref: '#/groups/0'
69+
$ref: '#/groups/1'
5970
prov: []
60-
self_ref: '#/texts/1'
71+
self_ref: '#/texts/2'
6172
text: here
6273
- children: []
6374
content_layer: body
6475
enumerated: false
6576
label: list_item
6677
marker: '-'
6778
orig: there
68-
parent:
69-
$ref: '#/groups/0'
70-
prov: []
71-
self_ref: '#/texts/2'
72-
text: there
73-
- children: []
74-
content_layer: body
75-
enumerated: true
76-
label: list_item
77-
marker: '1.'
78-
orig: foo
7979
parent:
8080
$ref: '#/groups/1'
8181
prov: []
8282
self_ref: '#/texts/3'
83-
text: foo
83+
text: there
8484
version: 1.4.0

test/test_docling_doc.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1626,6 +1626,7 @@ def _verify(filename: Path, document: DoclingDocument, generate: bool = False):
16261626
def test_misplaced_list_items():
16271627
filename = Path("test/data/doc/misplaced_list_items.yaml")
16281628
doc = DoclingDocument.load_from_yaml(filename)
1629+
doc.normalize()
16291630

16301631
dt_pred = doc.export_to_doctags()
16311632
_verify_regression_test(dt_pred, filename=str(filename), ext="dt")

0 commit comments

Comments
 (0)