|
| 1 | +import json |
1 | 2 | import re
|
2 | 3 | from pathlib import Path
|
3 | 4 | from typing import List
|
@@ -157,6 +158,7 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
|
157 | 158 |
|
158 | 159 | pic = doc.add_picture(prov=prov)
|
159 | 160 | pic.captions.extend(caption_refs)
|
| 161 | + _add_child_elements(pic, doc, obj, pelem) |
160 | 162 |
|
161 | 163 | elif ptype == "table":
|
162 | 164 | current_list = None
|
@@ -250,6 +252,13 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
|
250 | 252 | tbl = doc.add_table(data=tbl_data, prov=prov)
|
251 | 253 | tbl.captions.extend(caption_refs)
|
252 | 254 |
|
| 255 | + elif ptype in ["form", "key_value_region"]: |
| 256 | + |
| 257 | + label = DocItemLabel(ptype) |
| 258 | + container_el = doc.add_group(label=label, name=label) |
| 259 | + |
| 260 | + _add_child_elements(container_el, doc, obj, pelem) |
| 261 | + |
253 | 262 | elif "text" in obj:
|
254 | 263 | text = obj["text"][span_i:span_j]
|
255 | 264 |
|
@@ -297,6 +306,30 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
|
297 | 306 | return doc
|
298 | 307 |
|
299 | 308 |
|
| 309 | +def _add_child_elements(container_el, doc, obj, pelem): |
| 310 | + children = obj.get("payload", []) |
| 311 | + for child in children: |
| 312 | + c_label = child["label"] |
| 313 | + c_bbox = BoundingBox.model_validate(child["bbox"]) |
| 314 | + c_text = " ".join([ |
| 315 | + cell["text"].replace("\x02", "-").strip() |
| 316 | + for cell in child["cells"] |
| 317 | + if len(cell["text"].strip()) > 0 |
| 318 | + ]) |
| 319 | + |
| 320 | + c_prov = ProvenanceItem( |
| 321 | + page_no=pelem["page"], |
| 322 | + charspan=(0, len(c_text)), |
| 323 | + bbox=c_bbox |
| 324 | + ) |
| 325 | + if c_label == DocItemLabel.LIST_ITEM: |
| 326 | + # TODO: Infer if this is a numbered or a bullet list item |
| 327 | + doc.add_list_item(parent=container_el, label=c_label, text=c_text, prov=c_prov) |
| 328 | + elif c_label == DocItemLabel.SECTION_HEADER: |
| 329 | + doc.add_heading(parent=container_el, label=c_label, text=c_text, prov=c_prov) |
| 330 | + else: |
| 331 | + doc.add_text(parent=container_el, label=c_label, text=c_text, prov=c_prov) |
| 332 | + |
300 | 333 | def to_legacy_document_format(doc_glm, doc_leg={}, update_name_label=False):
|
301 | 334 | """Convert Document object (with `body`) to its legacy format (with `main-text`)"""
|
302 | 335 |
|
|
0 commit comments