Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 47 additions & 8 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -593,6 +593,21 @@ def get_image(self, doc: "DoclingDocument") -> Optional[PILImage.Image]:
class TextItem(DocItem):
"""TextItem."""

label: typing.Literal[
DocItemLabel.CAPTION,
DocItemLabel.CHECKBOX_SELECTED,
DocItemLabel.CHECKBOX_UNSELECTED,
DocItemLabel.CODE,
DocItemLabel.FOOTNOTE,
DocItemLabel.FORMULA,
DocItemLabel.PAGE_FOOTER,
DocItemLabel.PAGE_HEADER,
DocItemLabel.PARAGRAPH,
DocItemLabel.REFERENCE,
DocItemLabel.TEXT,
DocItemLabel.TITLE,
]

orig: str # untreated representation
text: str # sanitized representation

Expand Down Expand Up @@ -644,8 +659,10 @@ def export_to_document_tokens(
class SectionHeaderItem(TextItem):
"""SectionItem."""

label: typing.Literal[DocItemLabel.SECTION_HEADER] = DocItemLabel.SECTION_HEADER
level: LevelNumber
label: typing.Literal[DocItemLabel.SECTION_HEADER] = (
DocItemLabel.SECTION_HEADER # type: ignore[assignment]
)
level: LevelNumber = 1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume the reason this works with serialization and deserialization, despite setting a default for level, is because the label is now non-overlapping with the label literals in TextItem? If yes, that's great.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, label is now non-overlapping — and is actually used as the discriminator field in ContentItem further below.


def export_to_document_tokens(
self,
Expand Down Expand Up @@ -695,9 +712,11 @@ def export_to_document_tokens(
class ListItem(TextItem):
"""SectionItem."""

label: typing.Literal[DocItemLabel.LIST_ITEM] = DocItemLabel.LIST_ITEM
label: typing.Literal[DocItemLabel.LIST_ITEM] = (
DocItemLabel.LIST_ITEM # type: ignore[assignment]
)
enumerated: bool = False
marker: str # The bullet or number symbol that prefixes this list item
marker: str = "-" # The bullet or number symbol that prefixes this list item


class FloatingItem(DocItem):
Expand Down Expand Up @@ -923,7 +942,10 @@ class TableItem(FloatingItem):
"""TableItem."""

data: TableData
label: typing.Literal[DocItemLabel.TABLE] = DocItemLabel.TABLE
label: typing.Literal[
DocItemLabel.DOCUMENT_INDEX,
DocItemLabel.TABLE,
] = DocItemLabel.TABLE

def export_to_dataframe(self) -> pd.DataFrame:
"""Export the table as a Pandas DataFrame."""
Expand Down Expand Up @@ -1272,9 +1294,25 @@ def export_to_document_tokens(
class KeyValueItem(DocItem):
"""KeyValueItem."""

label: typing.Literal[DocItemLabel.KEY_VALUE_REGION] = DocItemLabel.KEY_VALUE_REGION


ContentItem = Union[
TextItem, SectionHeaderItem, ListItem, PictureItem, TableItem, KeyValueItem
class FormItem(DocItem):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure we need a FormItem at this point. We can delay adding it until we actually use it.
The changes for the layout processing in docling-project/docling#530 currently just use a GroupItem for Forms and Key-Value-Regions; these act purely as groups without special semantics.

"""FormItem."""

label: typing.Literal[DocItemLabel.FORM] = DocItemLabel.FORM


ContentItem = Annotated[
Union[
TextItem,
SectionHeaderItem,
ListItem,
PictureItem,
TableItem,
KeyValueItem,
],
Field(discriminator="label"),
]


Expand Down Expand Up @@ -1505,6 +1543,7 @@ def add_table(
caption: Optional[Union[TextItem, RefItem]] = None, # This is not cool yet.
prov: Optional[ProvenanceItem] = None,
parent: Optional[GroupItem] = None,
label: DocItemLabel = DocItemLabel.TABLE,
):
"""add_table.

Expand All @@ -1522,7 +1561,7 @@ def add_table(
cref = f"#/tables/{table_index}"

tbl_item = TableItem(
label=DocItemLabel.TABLE, data=data, self_ref=cref, parent=parent.get_ref()
label=label, data=data, self_ref=cref, parent=parent.get_ref()
)
if prov:
tbl_item.prov.append(prov)
Expand Down
64 changes: 29 additions & 35 deletions docs/DoclingDocument.json
Original file line number Diff line number Diff line change
Expand Up @@ -171,32 +171,6 @@
"title": "CoordOrigin",
"type": "string"
},
"DocItemLabel": {
"description": "DocItemLabel.",
"enum": [
"caption",
"footnote",
"formula",
"list_item",
"page_footer",
"page_header",
"picture",
"section_header",
"table",
"text",
"title",
"document_index",
"code",
"checkbox_selected",
"checkbox_unselected",
"form",
"key_value_region",
"paragraph",
"reference"
],
"title": "DocItemLabel",
"type": "string"
},
"DocumentOrigin": {
"description": "FileSource.",
"properties": {
Expand Down Expand Up @@ -362,7 +336,10 @@
"type": "array"
},
"label": {
"$ref": "#/$defs/DocItemLabel"
"const": "key_value_region",
"default": "key_value_region",
"title": "Label",
"type": "string"
},
"prov": {
"default": [],
Expand All @@ -374,8 +351,7 @@
}
},
"required": [
"self_ref",
"label"
"self_ref"
],
"title": "KeyValueItem",
"type": "object"
Expand Down Expand Up @@ -436,15 +412,15 @@
"type": "boolean"
},
"marker": {
"default": "-",
"title": "Marker",
"type": "string"
}
},
"required": [
"self_ref",
"orig",
"text",
"marker"
"text"
],
"title": "ListItem",
"type": "object"
Expand Down Expand Up @@ -1036,6 +1012,7 @@
"type": "string"
},
"level": {
"default": 1,
"maximum": 100,
"minimum": 1,
"title": "Level",
Expand All @@ -1045,8 +1022,7 @@
"required": [
"self_ref",
"orig",
"text",
"level"
"text"
],
"title": "SectionHeaderItem",
"type": "object"
Expand Down Expand Up @@ -1192,8 +1168,11 @@
"type": "array"
},
"label": {
"const": "table",
"default": "table",
"enum": [
"document_index",
"table"
],
"title": "Label",
"type": "string"
},
Expand Down Expand Up @@ -1280,7 +1259,22 @@
"type": "array"
},
"label": {
"$ref": "#/$defs/DocItemLabel"
"enum": [
"caption",
"checkbox_selected",
"checkbox_unselected",
"code",
"footnote",
"formula",
"page_footer",
"page_header",
"paragraph",
"reference",
"text",
"title"
],
"title": "Label",
"type": "string"
},
"prov": {
"default": [],
Expand Down
Loading
Loading