From d2b1e60d79d961c53889016eff06536a97ec31ee Mon Sep 17 00:00:00 2001 From: Christoph Auer <60343111+cau-git@users.noreply.github.com> Date: Mon, 21 Oct 2024 15:54:59 +0200 Subject: [PATCH] feat: Disambiguate section headings and list items from text items in DoclingDocument (#86) * Disambiguate section headings and list items from text items Signed-off-by: Christoph Auer * Group list items Signed-off-by: Christoph Auer --------- Signed-off-by: Christoph Auer --- deepsearch_glm/utils/doc_utils.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/deepsearch_glm/utils/doc_utils.py b/deepsearch_glm/utils/doc_utils.py index 6794895d..f6a553e7 100644 --- a/deepsearch_glm/utils/doc_utils.py +++ b/deepsearch_glm/utils/doc_utils.py @@ -9,6 +9,7 @@ DocItemLabel, DoclingDocument, DocumentOrigin, + GroupLabel, ProvenanceItem, Size, TableCell, @@ -82,6 +83,8 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument: else: props = pd.DataFrame() + current_list = None + for ix, pelem in enumerate(doc_glm["page-elements"]): ptype = pelem["type"] span_i = pelem["span"][0] @@ -105,10 +108,12 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument: obj = resolve_item(path, doc_glm) if obj is None: + current_list = None print(f"warning: undefined {path}") continue if ptype == "figure": + current_list = None text = "" caption_refs = [] for caption in obj["captions"]: @@ -154,6 +159,7 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument: pic.captions.extend(caption_refs) elif ptype == "table": + current_list = None text = "" caption_refs = [] for caption in obj["captions"]: @@ -263,13 +269,24 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument: pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT ), ) + label = DocItemLabel(name_label) - # TODO: Decide on add_heading, add_list_item, or add_text according to label. - doc.add_text(label=DocItemLabel(name_label), text=text, prov=prov) + if label == DocItemLabel.LIST_ITEM: + if current_list is None: + current_list = doc.add_group(label=GroupLabel.LIST, name="list") - else: - pass - # This branch should not be reachable. + # TODO: Infer if this is a numbered or a bullet list item + doc.add_list_item( + text=text, enumerated=False, prov=prov, parent=current_list + ) + elif label == DocItemLabel.SECTION_HEADER: + current_list = None + + doc.add_heading(text=text, prov=prov) + else: + current_list = None + + doc.add_text(label=DocItemLabel(name_label), text=text, prov=prov) for page_dim in doc_glm["page-dimensions"]: page_no = int(page_dim["page"])