|
7 | 7 |
|
8 | 8 | import hashlib
|
9 | 9 | import uuid
|
10 |
| -from typing import Union |
| 10 | +from pathlib import Path |
| 11 | +from typing import Dict, Optional, Union |
11 | 12 |
|
12 | 13 | from docling_core.types.doc import (
|
| 14 | + BoundingBox, |
| 15 | + CoordOrigin, |
13 | 16 | DocItem,
|
14 | 17 | DocItemLabel,
|
15 | 18 | DoclingDocument,
|
| 19 | + DocumentOrigin, |
16 | 20 | PictureItem,
|
| 21 | + ProvenanceItem, |
17 | 22 | SectionHeaderItem,
|
| 23 | + Size, |
18 | 24 | TableCell,
|
19 | 25 | TableItem,
|
20 | 26 | TextItem,
|
21 | 27 | )
|
22 |
| -from docling_core.types.doc.document import ListItem |
| 28 | +from docling_core.types.doc.document import GroupItem, ListItem, TableData |
| 29 | +from docling_core.types.doc.labels import GroupLabel |
23 | 30 | from docling_core.types.legacy_doc.base import (
|
24 | 31 | BaseCell,
|
25 | 32 | BaseText,
|
@@ -342,5 +349,285 @@ def _make_spans(cell: TableCell, table_item: TableItem):
|
342 | 349 | return legacy_doc
|
343 | 350 |
|
344 | 351 |
|
345 |
| -# def legacy_to_docling_document(legacy_doc: DsDocument) -> DoclingDocument: |
346 |
| -# """Convert a legacy document to DoclingDocument.""" |
def legacy_to_docling_document(legacy_doc: DsDocument) -> DoclingDocument:  # noqa: C901
    """Convert a legacy document to DoclingDocument.

    It is known that the following content will not be preserved in the transformation:
    - name of labels (upper vs lower case)
    - caption of figures are not in main-text anymore
    - s3_data removed
    - model metadata removed
    - logs removed
    - document hash cannot be preserved

    Args:
        legacy_doc: The legacy document to convert.

    Returns:
        A new DoclingDocument built from the page dimensions, page headers,
        page footers, footnotes and main-text items of ``legacy_doc``.
    """

    def _transform_prov(item: BaseCell) -> Optional[ProvenanceItem]:
        """Create a new provenance from the first provenance entry of a legacy item."""
        if not item.prov:
            return None
        first_prov = item.prov[0]
        return ProvenanceItem(
            page_no=int(first_prov.page),
            charspan=tuple(first_prov.span),
            bbox=BoundingBox.from_tuple(
                tuple(first_prov.bbox), origin=CoordOrigin.BOTTOMLEFT
            ),
        )

    def _resolve(orig_item):
        """Return the referenced object for a Ref, or the item itself otherwise."""
        if isinstance(orig_item, Ref):
            return legacy_doc._resolve_ref(orig_item)
        return orig_item

    def _is_caption(candidate) -> bool:
        """Tell whether a legacy item is a caption, by object type or by name."""
        if not isinstance(candidate, BaseText):
            return False
        if candidate.obj_type.lower() == "caption":
            return True
        return candidate.name is not None and candidate.name.lower() == "caption"

    def _is_list_item(candidate, candidate_type: str) -> bool:
        """Tell whether a legacy item must be converted into a list item."""
        # NOTE: exact comparison on purpose; a substring test would also match
        # unrelated types such as "" or "list".
        return isinstance(candidate, BaseText) and (
            candidate_type == "list-item-level-1"
            or candidate.name in {"list", "list-item"}
        )

    def _build_table_data(table: DsSchemaTable) -> TableData:
        """Convert the cell grid of a legacy table into a TableData object."""
        table_data = TableData(num_cols=table.num_cols, num_rows=table.num_rows)
        if table.data is None:
            return table_data

        seen_spans = set()
        for row_ix, row in enumerate(table.data):
            for col_ix, orig_cell_data in enumerate(row):
                cell_bbox: Optional[BoundingBox] = (
                    BoundingBox.from_tuple(
                        tuple(orig_cell_data.bbox),
                        origin=CoordOrigin.BOTTOMLEFT,
                    )
                    if orig_cell_data.bbox is not None
                    else None
                )
                cell = TableCell(
                    start_row_offset_idx=row_ix,
                    end_row_offset_idx=row_ix + 1,
                    start_col_offset_idx=col_ix,
                    end_col_offset_idx=col_ix + 1,
                    text=orig_cell_data.text,
                    bbox=cell_bbox,
                    column_header=(orig_cell_data.obj_type == "col_header"),
                    row_header=(orig_cell_data.obj_type == "row_header"),
                    row_section=(orig_cell_data.obj_type == "row_section"),
                )

                if orig_cell_data.spans is not None:
                    # convert to a tuple of tuples for hashing
                    spans_tuple = tuple(tuple(span) for span in orig_cell_data.spans)

                    # a merged cell is repeated at every grid position it
                    # covers; emit it only once
                    if spans_tuple in seen_spans:
                        continue
                    seen_spans.add(spans_tuple)

                    cell.start_row_offset_idx = min(s[0] for s in spans_tuple)
                    cell.end_row_offset_idx = max(s[0] for s in spans_tuple) + 1
                    cell.start_col_offset_idx = min(s[1] for s in spans_tuple)
                    cell.end_col_offset_idx = max(s[1] for s in spans_tuple) + 1

                    cell.row_span = (
                        cell.end_row_offset_idx - cell.start_row_offset_idx
                    )
                    cell.col_span = (
                        cell.end_col_offset_idx - cell.start_col_offset_idx
                    )

                table_data.table_cells.append(cell)
        return table_data

    origin = DocumentOrigin(
        mimetype="application/pdf",
        filename=legacy_doc.file_info.filename,
        binary_hash=legacy_doc.file_info.document_hash,
    )
    doc_name = Path(origin.filename).stem

    doc: DoclingDocument = DoclingDocument(name=doc_name, origin=origin)

    def _attach_caption(floating_item, caption_item: Optional[BaseText]) -> None:
        """Add the caption as a child text of a table/picture and reference it."""
        if caption_item is None or caption_item.text is None:
            return
        caption = doc.add_text(
            label=DocItemLabel.CAPTION,
            text=caption_item.text,
            prov=_transform_prov(caption_item),
            parent=floating_item,
        )
        floating_item.captions.append(caption.get_ref())

    # define pages
    if legacy_doc.page_dimensions is not None:
        for page_dim in legacy_doc.page_dimensions:
            doc.add_page(
                page_no=int(page_dim.page),
                size=Size(width=page_dim.width, height=page_dim.height),
            )

    # page headers, page footers and footnotes all go under the furniture node
    furniture_sources = (
        (legacy_doc.page_headers, DocItemLabel.PAGE_HEADER),
        (legacy_doc.page_footers, DocItemLabel.PAGE_FOOTER),
        (legacy_doc.footnotes, DocItemLabel.FOOTNOTE),
    )
    for text_items, furniture_label in furniture_sources:
        if text_items is None:
            continue
        for text_item in text_items:
            if text_item.text is None:
                continue
            doc.add_text(
                label=furniture_label,
                text=text_item.text,
                # keep the provenance instead of computing and discarding it
                prov=_transform_prov(text_item),
                parent=doc.furniture,
            )

    # main-text content
    if legacy_doc.main_text is None:
        return doc

    item: Optional[Union[BaseCell, BaseText]]

    # resolve references once; the passes below all work on the same objects
    resolved_items = [_resolve(orig_item) for orig_item in legacy_doc.main_text]

    # collect all captions embedded in table and figure objects
    # to avoid repeating them
    embedded_captions: Dict[str, int] = {}
    for ix, item in enumerate(resolved_items):
        if item is None:
            continue
        if isinstance(item, (DsSchemaTable, Figure)) and item.text:
            embedded_captions[item.text] = ix

    # build lookup from floating objects to their caption item
    floating_to_caption: Dict[int, BaseText] = {}
    for item in resolved_items:
        if item is None:
            continue
        if _is_caption(item) and item.text in embedded_captions:
            floating_to_caption[embedded_captions[item.text]] = item

    # main loop iteration
    current_list: Optional[GroupItem] = None
    for ix, item in enumerate(resolved_items):
        if item is None:
            continue

        prov = _transform_prov(item)
        item_type = item.obj_type.lower()
        is_list_item = _is_list_item(item, item_type)

        # if a group is needed, add it; any other item closes the open list
        if is_list_item:
            if current_list is None:
                current_list = doc.add_group(label=GroupLabel.LIST, name="list")
        else:
            current_list = None

        # add the document item in the document
        if isinstance(item, BaseText):
            text = item.text if item.text is not None else ""
            label_name = item.name if item.name is not None else "text"

            if _is_caption(item) and text in embedded_captions:
                # skip captions if they are embedded in the actual floating
                # objects; they are attached to the table/picture instead
                continue

            if item_type == "caption":
                # captions without a related object are inserted as text
                # (exactly once: this branch is part of the chain below)
                doc.add_text(label=DocItemLabel.TEXT, text=text, prov=prov)

            # first title match
            elif item_type == "title":
                doc.add_title(text=text, prov=prov)

            # secondary titles
            elif item_type in {"subtitle-level-1"}:
                doc.add_heading(text=text, prov=prov)

            # list item
            elif is_list_item:
                # TODO: Infer if this is a numbered or a bullet list item
                doc.add_list_item(
                    text=text, enumerated=False, prov=prov, parent=current_list
                )

            # normal text
            else:
                try:
                    label = DocItemLabel(label_name.replace("-", "_"))
                except ValueError:
                    # unknown legacy label names fall back to plain text
                    label = DocItemLabel.TEXT
                doc.add_text(label=label, text=text, prov=prov)

        elif isinstance(item, DsSchemaTable):
            new_item = doc.add_table(data=_build_table_data(item), prov=prov)
            _attach_caption(new_item, floating_to_caption.get(ix))

        elif isinstance(item, Figure):
            new_item = doc.add_picture(prov=prov)
            _attach_caption(new_item, floating_to_caption.get(ix))

        # equations
        elif (
            isinstance(item, BaseCell)
            and item.text is not None
            and item_type in {"formula", "equation"}
        ):
            doc.add_text(label=DocItemLabel.FORMULA, text=item.text, prov=prov)

    return doc
0 commit comments