Skip to content

Commit 9f026f0

Browse files
committed
Resolve nested clusters for DoclingDocument
Signed-off-by: Christoph Auer <[email protected]>
1 parent 5beeb00 commit 9f026f0

File tree

1 file changed

+33
-0
lines changed

1 file changed

+33
-0
lines changed

deepsearch_glm/utils/doc_utils.py

Lines changed: 33 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
import json
12
import re
23
from pathlib import Path
34
from typing import List
@@ -157,6 +158,7 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
157158

158159
pic = doc.add_picture(prov=prov)
159160
pic.captions.extend(caption_refs)
161+
_add_child_elements(pic, doc, obj, pelem)
160162

161163
elif ptype == "table":
162164
current_list = None
@@ -250,6 +252,13 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
250252
tbl = doc.add_table(data=tbl_data, prov=prov)
251253
tbl.captions.extend(caption_refs)
252254

255+
elif ptype in ["form", "key_value_region"]:
256+
257+
label = DocItemLabel(ptype)
258+
container_el = doc.add_group(label=label, name=label)
259+
260+
_add_child_elements(container_el, doc, obj, pelem)
261+
253262
elif "text" in obj:
254263
text = obj["text"][span_i:span_j]
255264

@@ -297,6 +306,30 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
297306
return doc
298307

299308

309+
def _add_child_elements(container_el, doc, obj, pelem):
310+
children = obj.get("payload", [])
311+
for child in children:
312+
c_label = child["label"]
313+
c_bbox = BoundingBox.model_validate(child["bbox"])
314+
c_text = " ".join([
315+
cell["text"].replace("\x02", "-").strip()
316+
for cell in child["cells"]
317+
if len(cell["text"].strip()) > 0
318+
])
319+
320+
c_prov = ProvenanceItem(
321+
page_no=pelem["page"],
322+
charspan=(0, len(c_text)),
323+
bbox=c_bbox
324+
)
325+
if c_label == DocItemLabel.LIST_ITEM:
326+
# TODO: Infer if this is a numbered or a bullet list item
327+
doc.add_list_item(parent=container_el, label=c_label, text=c_text, prov=c_prov)
328+
elif c_label == DocItemLabel.SECTION_HEADER:
329+
doc.add_heading(parent=container_el, label=c_label, text=c_text, prov=c_prov)
330+
else:
331+
doc.add_text(parent=container_el, label=c_label, text=c_text, prov=c_prov)
332+
300333
def to_legacy_document_format(doc_glm, doc_leg={}, update_name_label=False):
301334
"""Convert Document object (with `body`) to its legacy format (with `main-text`)"""
302335

0 commit comments

Comments
 (0)