Skip to content

Commit

Permalink
Read children payload in nested clusters
Browse files Browse the repository at this point in the history
Signed-off-by: Christoph Auer <[email protected]>
  • Loading branch information
cau-git committed Dec 3, 2024
1 parent 18b60f8 commit 1ab47e1
Showing 1 changed file with 25 additions and 22 deletions.
47 changes: 25 additions & 22 deletions deepsearch_glm/utils/doc_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,28 +307,31 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:


def _add_child_elements(container_el, doc, obj, pelem):
children = obj.get("payload", [])
for child in children:
c_label = child["label"]
c_bbox = BoundingBox.model_validate(child["bbox"])
c_text = " ".join([
cell["text"].replace("\x02", "-").strip()
for cell in child["cells"]
if len(cell["text"].strip()) > 0
])

c_prov = ProvenanceItem(
page_no=pelem["page"],
charspan=(0, len(c_text)),
bbox=c_bbox
)
if c_label == DocItemLabel.LIST_ITEM:
# TODO: Infer if this is a numbered or a bullet list item
doc.add_list_item(parent=container_el, label=c_label, text=c_text, prov=c_prov)
elif c_label == DocItemLabel.SECTION_HEADER:
doc.add_heading(parent=container_el, label=c_label, text=c_text, prov=c_prov)
else:
doc.add_text(parent=container_el, label=c_label, text=c_text, prov=c_prov)
payload = obj.get("payload")
if payload is not None:
children = payload.get("children", [])

for child in children:
c_label = child["label"]
c_bbox = BoundingBox.model_validate(child["bbox"])
c_text = " ".join([
cell["text"].replace("\x02", "-").strip()
for cell in child["cells"]
if len(cell["text"].strip()) > 0
])

c_prov = ProvenanceItem(
page_no=pelem["page"],
charspan=(0, len(c_text)),
bbox=c_bbox
)
if c_label == DocItemLabel.LIST_ITEM:
# TODO: Infer if this is a numbered or a bullet list item
doc.add_list_item(parent=container_el, label=c_label, text=c_text, prov=c_prov)
elif c_label == DocItemLabel.SECTION_HEADER:
doc.add_heading(parent=container_el, label=c_label, text=c_text, prov=c_prov)
else:
doc.add_text(parent=container_el, label=c_label, text=c_text, prov=c_prov)

def to_legacy_document_format(doc_glm, doc_leg={}, update_name_label=False):
"""Convert Document object (with `body`) to its legacy format (with `main-text`)"""
Expand Down

0 comments on commit 1ab47e1

Please sign in to comment.