Skip to content

Commit 5d41805

Browse files
committed
feat: add ref number normalization
Signed-off-by: Panos Vagenas <[email protected]>
1 parent c383f64 commit 5d41805

File tree

4 files changed

+88
-22
lines changed

4 files changed

+88
-22
lines changed

docling_core/types/doc/document.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4264,3 +4264,68 @@ def validate_misplaced_list_items(self):
42644264
hyperlink=li.hyperlink,
42654265
)
42664266
return self
4267+
4268+
def normalize(self) -> None:
    """Normalize ref numbering by ordering node items as per iterate_items().

    The document tree is walked with ``iterate_items()`` (groups included,
    pictures traversed, all content layers). Every visited item is
    deep-copied into a fresh per-type list, so that its index in that list
    (and therefore its ``self_ref``) matches its position in traversal
    order. ``parent`` and ``children`` references are rewritten to the new
    numbering, and so are any ``captions`` / ``references`` / ``footnotes``
    reference lists found on the copied items.

    The document is only modified once the whole traversal has succeeded:
    ``groups``, ``texts``, ``pictures``, ``tables``, ``key_value_items``,
    ``form_items`` and ``body`` are then replaced. Items that are not
    yielded by ``iterate_items()`` are not carried over to the new lists.

    Raises:
        RuntimeError: If a remapped parent ref is neither ``#/body`` nor
            of the form ``#/<list>/<index>``.
    """
    # Copy the body node itself, but rebuild its children from scratch.
    new_body = copy.deepcopy(self.body)
    new_body.children = []

    item_lists: dict[str, list[NodeItem]] = {
        "groups": [],
        "texts": [],
        "pictures": [],
        "tables": [],
        "key_value_items": [],
        "form_items": [],
    }
    orig_ref_to_new_ref: dict[str, str] = {}

    # collect items in traversal order
    for item, _ in self.iterate_items(
        with_groups=True,
        traverse_pictures=True,
        included_content_layers=set(ContentLayer),
    ):
        key = item.self_ref.split("/")[1]

        if key == "body":
            # The body keeps its ref; it only needs to be resolvable as a
            # parent of the items that follow.
            orig_ref_to_new_ref[item.self_ref] = "#/body"
            continue

        # The new index is the number of same-type items seen so far.
        new_cref = f"#/{key}/{len(item_lists[key])}"
        orig_ref_to_new_ref[item.self_ref] = new_cref

        new_item = copy.deepcopy(item)
        new_item.children = []  # re-populated as the children are visited
        new_item.self_ref = new_cref
        item_lists[key].append(new_item)

        if not item.parent:
            continue

        # The parent's ref is looked up in the mapping built so far, i.e.
        # this relies on parents being yielded before their children.
        new_parent_cref = orig_ref_to_new_ref[item.parent.cref]
        new_item.parent = RefItem(cref=new_parent_cref)

        # add item to parent's children
        path_components = new_parent_cref.split("/")
        num_components = len(path_components)
        parent_node: NodeItem
        if num_components == 3:
            _, parent_key, parent_index_str = path_components
            parent_node = item_lists[parent_key][int(parent_index_str)]
        elif num_components == 2 and path_components[1] == "body":
            parent_node = new_body
        else:
            raise RuntimeError(f"Unsupported ref format: {new_parent_cref}")
        parent_node.children.append(RefItem(cref=new_cref))

    # Remap the remaining cross-references. This must happen after the
    # traversal, because a referenced item may be visited later than the
    # item referring to it.
    # NOTE(review): the field names below are the RefItem lists that
    # docling-core's floating items (pictures, tables) are expected to
    # carry; they are not visible in this part of the file - confirm.
    # Items without these fields are skipped, and refs to items that were
    # not visited are left as they are.
    for new_items in item_lists.values():
        for new_item in new_items:
            for field_name in ("captions", "references", "footnotes"):
                refs = getattr(new_item, field_name, None)
                if not refs:
                    continue
                for idx, ref in enumerate(refs):
                    mapped_cref = orig_ref_to_new_ref.get(
                        getattr(ref, "cref", None)
                    )
                    if mapped_cref is not None:
                        refs[idx] = RefItem(cref=mapped_cref)

    # update document
    self.groups = item_lists["groups"]  # type: ignore
    self.texts = item_lists["texts"]  # type: ignore
    self.pictures = item_lists["pictures"]  # type: ignore
    self.tables = item_lists["tables"]  # type: ignore
    self.key_value_items = item_lists["key_value_items"]  # type: ignore
    self.form_items = item_lists["form_items"]  # type: ignore
    self.body = new_body

test/data/doc/dummy_doc.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ texts:
9898

9999

100100
tables: # All tables...
101-
- self_ref: "#/table/0"
101+
- self_ref: "#/tables/0"
102102
label: "table"
103103
parent:
104104
$ref: "#/body"
Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
body:
22
children:
3-
- $ref: '#/groups/1'
4-
- $ref: '#/texts/0'
53
- $ref: '#/groups/0'
4+
- $ref: '#/texts/1'
5+
- $ref: '#/groups/1'
66
content_layer: body
77
label: unspecified
88
name: _root_
@@ -16,18 +16,18 @@ furniture:
1616
self_ref: '#/furniture'
1717
groups:
1818
- children:
19-
- $ref: '#/texts/1'
20-
- $ref: '#/texts/2'
19+
- $ref: '#/texts/0'
2120
content_layer: body
22-
label: list
21+
label: ordered_list
2322
name: group
2423
parent:
2524
$ref: '#/body'
2625
self_ref: '#/groups/0'
2726
- children:
27+
- $ref: '#/texts/2'
2828
- $ref: '#/texts/3'
2929
content_layer: body
30-
label: ordered_list
30+
label: list
3131
name: group
3232
parent:
3333
$ref: '#/body'
@@ -39,14 +39,25 @@ pictures: []
3939
schema_name: DoclingDocument
4040
tables: []
4141
texts:
42+
- children: []
43+
content_layer: body
44+
enumerated: true
45+
label: list_item
46+
marker: '1.'
47+
orig: foo
48+
parent:
49+
$ref: '#/groups/0'
50+
prov: []
51+
self_ref: '#/texts/0'
52+
text: foo
4253
- children: []
4354
content_layer: body
4455
label: text
4556
orig: bar
4657
parent:
4758
$ref: '#/body'
4859
prov: []
49-
self_ref: '#/texts/0'
60+
self_ref: '#/texts/1'
5061
text: bar
5162
- children: []
5263
content_layer: body
@@ -55,30 +66,19 @@ texts:
5566
marker: '-'
5667
orig: here
5768
parent:
58-
$ref: '#/groups/0'
69+
$ref: '#/groups/1'
5970
prov: []
60-
self_ref: '#/texts/1'
71+
self_ref: '#/texts/2'
6172
text: here
6273
- children: []
6374
content_layer: body
6475
enumerated: false
6576
label: list_item
6677
marker: '-'
6778
orig: there
68-
parent:
69-
$ref: '#/groups/0'
70-
prov: []
71-
self_ref: '#/texts/2'
72-
text: there
73-
- children: []
74-
content_layer: body
75-
enumerated: true
76-
label: list_item
77-
marker: '1.'
78-
orig: foo
7979
parent:
8080
$ref: '#/groups/1'
8181
prov: []
8282
self_ref: '#/texts/3'
83-
text: foo
83+
text: there
8484
version: 1.4.0

test/test_docling_doc.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1624,6 +1624,7 @@ def _verify(filename: Path, document: DoclingDocument, generate: bool = False):
16241624
def test_misplaced_list_items():
16251625
filename = Path("test/data/doc/misplaced_list_items.yaml")
16261626
doc = DoclingDocument.load_from_yaml(filename)
1627+
doc.normalize()
16271628

16281629
dt_pred = doc.export_to_doctags()
16291630
_verify_regression_test(dt_pred, filename=str(filename), ext="dt")

0 commit comments

Comments
 (0)