Skip to content

Commit b31e0a3

Browse files
authored
feat: add legacy to DoclingDocument utility (#108)
* add legacy_to_docling_document Signed-off-by: Michele Dolfi <[email protected]> * fix for python < 3.12 Signed-off-by: Michele Dolfi <[email protected]> * add test Signed-off-by: Michele Dolfi <[email protected]> * add docs with not-preserved components Signed-off-by: Michele Dolfi <[email protected]> --------- Signed-off-by: Michele Dolfi <[email protected]>
1 parent 047a196 commit b31e0a3

File tree

3 files changed

+6797
-5
lines changed

3 files changed

+6797
-5
lines changed

docling_core/utils/legacy.py

+291-4
Original file line numberDiff line numberDiff line change
@@ -7,19 +7,26 @@
77

88
import hashlib
99
import uuid
10-
from typing import Union
10+
from pathlib import Path
11+
from typing import Dict, Optional, Union
1112

1213
from docling_core.types.doc import (
14+
BoundingBox,
15+
CoordOrigin,
1316
DocItem,
1417
DocItemLabel,
1518
DoclingDocument,
19+
DocumentOrigin,
1620
PictureItem,
21+
ProvenanceItem,
1722
SectionHeaderItem,
23+
Size,
1824
TableCell,
1925
TableItem,
2026
TextItem,
2127
)
22-
from docling_core.types.doc.document import ListItem
28+
from docling_core.types.doc.document import GroupItem, ListItem, TableData
29+
from docling_core.types.doc.labels import GroupLabel
2330
from docling_core.types.legacy_doc.base import (
2431
BaseCell,
2532
BaseText,
@@ -342,5 +349,285 @@ def _make_spans(cell: TableCell, table_item: TableItem):
342349
return legacy_doc
343350

344351

345-
# def legacy_to_docling_document(legacy_doc: DsDocument) -> DoclingDocument:
346-
# """Convert a legacy document to DoclingDocument."""
352+
def legacy_to_docling_document(legacy_doc: DsDocument) -> DoclingDocument:  # noqa: C901
    """Convert a legacy document to DoclingDocument.

    It is known that the following content will not be preserved in the
    transformation:
    - name of labels (upper vs lower case)
    - captions of figures are no longer part of the main-text
    - s3_data removed
    - model metadata removed
    - logs removed
    - document hash cannot be preserved

    Args:
        legacy_doc: The legacy document to convert.

    Returns:
        A new DoclingDocument holding the pages, the furniture items (page
        headers, page footers and footnotes) and the main-text items of
        ``legacy_doc``.
    """

    def _transform_prov(
        item: Union[BaseCell, BaseText]
    ) -> Optional[ProvenanceItem]:
        """Create a new provenance from the first prov entry of a legacy item."""
        if not item.prov:
            return None
        first_prov = item.prov[0]
        return ProvenanceItem(
            page_no=int(first_prov.page),
            charspan=tuple(first_prov.span),
            bbox=BoundingBox.from_tuple(
                tuple(first_prov.bbox), origin=CoordOrigin.BOTTOMLEFT
            ),
        )

    def _transform_table_data(table: DsSchemaTable) -> TableData:
        """Build the TableData of a legacy table, merging spanned cells."""
        table_data = TableData(num_cols=table.num_cols, num_rows=table.num_rows)
        if table.data is None:
            return table_data

        seen_spans = set()
        for row_ix, row in enumerate(table.data):
            for col_ix, orig_cell_data in enumerate(row):
                cell_bbox: Optional[BoundingBox] = (
                    BoundingBox.from_tuple(
                        tuple(orig_cell_data.bbox),
                        origin=CoordOrigin.BOTTOMLEFT,
                    )
                    if orig_cell_data.bbox is not None
                    else None
                )
                cell = TableCell(
                    start_row_offset_idx=row_ix,
                    end_row_offset_idx=row_ix + 1,
                    start_col_offset_idx=col_ix,
                    end_col_offset_idx=col_ix + 1,
                    text=orig_cell_data.text,
                    bbox=cell_bbox,
                    column_header=(orig_cell_data.obj_type == "col_header"),
                    row_header=(orig_cell_data.obj_type == "row_header"),
                    row_section=(orig_cell_data.obj_type == "row_section"),
                )

                # An empty span list is handled like a missing one: min()/max()
                # below would raise ValueError on an empty sequence.
                if orig_cell_data.spans:
                    # convert to a tuple of tuples for hashing
                    spans_tuple = tuple(tuple(span) for span in orig_cell_data.spans)

                    # the legacy format repeats a spanned cell at every grid
                    # position it covers; keep only the first occurrence
                    if spans_tuple in seen_spans:
                        continue
                    seen_spans.add(spans_tuple)

                    span_rows = [span[0] for span in spans_tuple]
                    span_cols = [span[1] for span in spans_tuple]
                    cell.start_row_offset_idx = min(span_rows)
                    cell.end_row_offset_idx = max(span_rows) + 1
                    cell.start_col_offset_idx = min(span_cols)
                    cell.end_col_offset_idx = max(span_cols) + 1

                    cell.row_span = cell.end_row_offset_idx - cell.start_row_offset_idx
                    cell.col_span = cell.end_col_offset_idx - cell.start_col_offset_idx

                table_data.table_cells.append(cell)

        return table_data

    origin = DocumentOrigin(
        mimetype="application/pdf",
        filename=legacy_doc.file_info.filename,
        binary_hash=legacy_doc.file_info.document_hash,
    )
    doc_name = Path(origin.filename).stem

    doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)

    # define pages
    if legacy_doc.page_dimensions is not None:
        for page_dim in legacy_doc.page_dimensions:
            doc.add_page(
                page_no=int(page_dim.page),
                size=Size(width=page_dim.width, height=page_dim.height),
            )

    # page headers, page footers and footnotes all go below the furniture node
    furniture_sources = (
        (legacy_doc.page_headers, DocItemLabel.PAGE_HEADER),
        (legacy_doc.page_footers, DocItemLabel.PAGE_FOOTER),
        (legacy_doc.footnotes, DocItemLabel.FOOTNOTE),
    )
    for furniture_items, furniture_label in furniture_sources:
        if furniture_items is None:
            continue
        for text_item in furniture_items:
            if text_item.text is None:
                continue
            doc.add_text(
                label=furniture_label,
                text=text_item.text,
                # keep the provenance of the legacy item instead of dropping it
                prov=_transform_prov(text_item),
                parent=doc.furniture,
            )

    # main-text content
    if legacy_doc.main_text is None:
        return doc

    # resolve the references once; entries which cannot be resolved stay None
    # and are skipped by every pass below
    resolved_items = [
        (
            legacy_doc._resolve_ref(orig_item)
            if isinstance(orig_item, Ref)
            else orig_item
        )
        for orig_item in legacy_doc.main_text
    ]

    # collect all captions embedded in table and figure objects
    # to avoid repeating them (caption text -> index of the floating object)
    embedded_captions: Dict[str, int] = {}
    for ix, item in enumerate(resolved_items):
        if item is None:
            continue
        if isinstance(item, (DsSchemaTable, Figure)) and item.text:
            embedded_captions[item.text] = ix

    # build lookup from floating objects to their caption item
    floating_to_caption: Dict[int, BaseText] = {}
    for item in resolved_items:
        if item is None:
            continue

        item_type = item.obj_type.lower()
        if (
            isinstance(item, BaseText)
            and (
                item_type == "caption"
                or (item.name is not None and item.name.lower() == "caption")
            )
            and item.text in embedded_captions
        ):
            floating_to_caption[embedded_captions[item.text]] = item

    def _attach_caption(
        floating_item: Union[TableItem, PictureItem], floating_ix: int
    ) -> None:
        """Add the caption matched to a floating object as its child, if any."""
        caption_item = floating_to_caption.get(floating_ix)
        if caption_item is None or caption_item.text is None:
            return
        caption = doc.add_text(
            label=DocItemLabel.CAPTION,
            text=caption_item.text,
            prov=_transform_prov(caption_item),
            parent=floating_item,
        )
        floating_item.captions.append(caption.get_ref())

    # main loop iteration
    current_list: Optional[GroupItem] = None
    for ix, item in enumerate(resolved_items):
        if item is None:
            continue

        prov = _transform_prov(item)
        item_type = item.obj_type.lower()

        # NOTE: this must be an equality test; `item_type in "list-item-level-1"`
        # is a substring check which also matches "", "list", "item", "1", ...
        is_list_item = isinstance(item, BaseText) and (
            item_type == "list-item-level-1" or item.name in {"list", "list-item"}
        )

        # if a group is needed, add it; any other item closes the current list
        if is_list_item:
            if current_list is None:
                current_list = doc.add_group(label=GroupLabel.LIST, name="list")
        else:
            current_list = None

        # add the document item in the document
        if isinstance(item, BaseText):
            text = item.text if item.text is not None else ""
            label_name = item.name if item.name is not None else "text"

            if item_type == "caption":
                # captions embedded in the actual floating objects are skipped
                # here, they are attached to their table or picture instead
                if text not in embedded_captions:
                    # captions without a related object are inserted as text
                    doc.add_text(label=DocItemLabel.TEXT, text=text, prov=prov)
                # the caption is fully handled; do not fall through to the
                # generic branches, which would insert it a second time
                continue

            # first title match
            if item_type == "title":
                doc.add_title(text=text, prov=prov)

            # secondary titles
            elif item_type in {"subtitle-level-1"}:
                doc.add_heading(text=text, prov=prov)

            # list item
            elif is_list_item:
                # TODO: Infer if this is a numbered or a bullet list item
                doc.add_list_item(
                    text=text, enumerated=False, prov=prov, parent=current_list
                )

            # normal text
            else:
                # legacy names use dashes where the label values use
                # underscores; unknown names fall back to plain text
                try:
                    label = DocItemLabel(label_name.replace("-", "_"))
                except ValueError:
                    label = DocItemLabel.TEXT
                doc.add_text(label=label, text=text, prov=prov)

        elif isinstance(item, DsSchemaTable):
            table_item = doc.add_table(data=_transform_table_data(item), prov=prov)
            _attach_caption(table_item, ix)

        elif isinstance(item, Figure):
            picture_item = doc.add_picture(prov=prov)
            _attach_caption(picture_item, ix)

        # equations
        elif (
            isinstance(item, BaseCell)
            and item.text is not None
            and item_type in {"formula", "equation"}
        ):
            doc.add_text(label=DocItemLabel.FORMULA, text=item.text, prov=prov)

    return doc

0 commit comments

Comments
 (0)