Skip to content

Commit fc073ce

Browse files
committed
chore: add ref number normalization
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 7094828 commit fc073ce

File tree

4 files changed

+157
-1
lines changed

4 files changed

+157
-1
lines changed

docling_core/types/doc/document.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4318,3 +4318,67 @@ def validate_misplaced_list_items(self):
43184318
hyperlink=li.hyperlink,
43194319
)
43204320
return self
4321+
4322+
def _normalize_references(self) -> None:
    """Normalize ref numbering by ordering node items as per iterate_items().

    Rebuilds the ``groups``, ``texts``, ``pictures``, ``tables``,
    ``key_value_items`` and ``form_items`` lists so that every item's index
    equals the position at which ``iterate_items()`` visits it within its
    list, and rewrites ``self_ref``, ``parent`` and ``children`` to match.
    The document is modified in place; items that the traversal does not
    yield are not carried over into the rebuilt lists.

    Raises:
        KeyError: If an item's parent has not been visited before the item
            itself, or an item lives under an unexpected top-level key.
        RuntimeError: If a remapped parent reference is neither ``#/body``
            nor of the form ``#/<list>/<index>``.
    """
    # Fresh body node without children; they are re-attached during traversal.
    new_body = GroupItem(**self.body.model_dump(exclude={"children"}))

    item_lists: dict[str, list[NodeItem]] = {
        "groups": [],
        "texts": [],
        "pictures": [],
        "tables": [],
        "key_value_items": [],
        "form_items": [],
    }
    orig_ref_to_new_ref: dict[str, str] = {}

    # First pass: collect items in traversal order and renumber them.
    for item, _ in self.iterate_items(
        with_groups=True,
        traverse_pictures=True,
        included_content_layers=set(ContentLayer),
    ):
        key = item.self_ref.split("/")[1]
        if key == "body":
            # The body keeps its reference and is rebuilt as `new_body`.
            orig_ref_to_new_ref[item.self_ref] = "#/body"
            continue

        # The next free index in the target list becomes the new reference.
        new_cref = f"#/{key}/{len(item_lists[key])}"
        orig_ref_to_new_ref[item.self_ref] = new_cref

        # Work on a copy so the lists being iterated are left untouched.
        new_item = copy.deepcopy(item)
        new_item.children = []
        new_item.self_ref = new_cref
        item_lists[key].append(new_item)

        if not new_item.parent:
            continue

        # The lookup relies on the parent having been visited already.
        new_parent_cref = orig_ref_to_new_ref[new_item.parent.cref]
        new_item.parent = RefItem(cref=new_parent_cref)

        # Resolve the (already rebuilt) parent node and register the child.
        path_components = new_parent_cref.split("/")
        parent_node: NodeItem
        if len(path_components) == 3:
            _, parent_key, parent_index_str = path_components
            parent_node = item_lists[parent_key][int(parent_index_str)]
        elif len(path_components) == 2 and path_components[1] == "body":
            parent_node = new_body
        else:
            raise RuntimeError(f"Unsupported ref format: {new_parent_cref}")
        parent_node.children.append(RefItem(cref=new_cref))

    # Second pass: remap auxiliary reference lists. This can only happen once
    # the full mapping is known, since a referenced item may be visited after
    # the item that points to it.
    # NOTE(review): assumes floating items (e.g. pictures, tables) expose
    # `captions`, `references` and `footnotes` as lists of RefItem -- confirm
    # against the model definitions. Items lacking these fields are skipped,
    # and refs to items outside the traversal are kept unchanged.
    for new_items in item_lists.values():
        for new_item in new_items:
            for field_name in ("captions", "references", "footnotes"):
                refs = getattr(new_item, field_name, None)
                if not refs:
                    continue
                remapped = [
                    RefItem(cref=orig_ref_to_new_ref.get(ref.cref, ref.cref))
                    for ref in refs
                ]
                setattr(new_item, field_name, remapped)

    # update document
    self.groups = item_lists["groups"]  # type: ignore
    self.texts = item_lists["texts"]  # type: ignore
    self.pictures = item_lists["pictures"]  # type: ignore
    self.tables = item_lists["tables"]  # type: ignore
    self.key_value_items = item_lists["key_value_items"]  # type: ignore
    self.form_items = item_lists["form_items"]  # type: ignore
    self.body = new_body

test/data/doc/dummy_doc.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ texts:
9898

9999

100100
tables: # All tables...
101-
- self_ref: "#/table/0"
101+
- self_ref: "#/tables/0"
102102
label: "table"
103103
parent:
104104
$ref: "#/body"
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
body:
2+
children:
3+
- $ref: '#/groups/0'
4+
- $ref: '#/texts/1'
5+
- $ref: '#/groups/1'
6+
content_layer: body
7+
label: unspecified
8+
name: _root_
9+
self_ref: '#/body'
10+
form_items: []
11+
furniture:
12+
children: []
13+
content_layer: furniture
14+
label: unspecified
15+
name: _root_
16+
self_ref: '#/furniture'
17+
groups:
18+
- children:
19+
- $ref: '#/texts/0'
20+
content_layer: body
21+
label: ordered_list
22+
name: group
23+
parent:
24+
$ref: '#/body'
25+
self_ref: '#/groups/0'
26+
- children:
27+
- $ref: '#/texts/2'
28+
- $ref: '#/texts/3'
29+
content_layer: body
30+
label: list
31+
name: group
32+
parent:
33+
$ref: '#/body'
34+
self_ref: '#/groups/1'
35+
key_value_items: []
36+
name: ''
37+
pages: {}
38+
pictures: []
39+
schema_name: DoclingDocument
40+
tables: []
41+
texts:
42+
- children: []
43+
content_layer: body
44+
enumerated: true
45+
label: list_item
46+
marker: '1.'
47+
orig: foo
48+
parent:
49+
$ref: '#/groups/0'
50+
prov: []
51+
self_ref: '#/texts/0'
52+
text: foo
53+
- children: []
54+
content_layer: body
55+
label: text
56+
orig: bar
57+
parent:
58+
$ref: '#/body'
59+
prov: []
60+
self_ref: '#/texts/1'
61+
text: bar
62+
- children: []
63+
content_layer: body
64+
enumerated: false
65+
label: list_item
66+
marker: '-'
67+
orig: here
68+
parent:
69+
$ref: '#/groups/1'
70+
prov: []
71+
self_ref: '#/texts/2'
72+
text: here
73+
- children: []
74+
content_layer: body
75+
enumerated: false
76+
label: list_item
77+
marker: '-'
78+
orig: there
79+
parent:
80+
$ref: '#/groups/1'
81+
prov: []
82+
self_ref: '#/texts/3'
83+
text: there
84+
version: 1.4.0

test/test_docling_doc.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1636,3 +1636,11 @@ def test_misplaced_list_items():
16361636
else:
16371637
exp_doc = DoclingDocument.load_from_yaml(exp_file)
16381638
assert doc == exp_doc
1639+
1640+
doc._normalize_references()
1641+
exp_file = filename.parent / f"{filename.stem}.norm.out.yaml"
1642+
if GEN_TEST_DATA:
1643+
doc.save_as_yaml(exp_file)
1644+
else:
1645+
exp_doc = DoclingDocument.load_from_yaml(exp_file)
1646+
assert doc == exp_doc

0 commit comments

Comments
 (0)