Skip to content

Commit 047a196

Browse files
authored
fix: improve doc item typing (#105)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 9628d19 commit 047a196

File tree

6 files changed

+129
-133
lines changed

6 files changed

+129
-133
lines changed

docling_core/types/doc/document.py

+45-12
Original file line number | Diff line number | Diff line change
@@ -593,6 +593,21 @@ def get_image(self, doc: "DoclingDocument") -> Optional[PILImage.Image]:
593593
class TextItem(DocItem):
594594
"""TextItem."""
595595

596+
label: typing.Literal[
597+
DocItemLabel.CAPTION,
598+
DocItemLabel.CHECKBOX_SELECTED,
599+
DocItemLabel.CHECKBOX_UNSELECTED,
600+
DocItemLabel.CODE,
601+
DocItemLabel.FOOTNOTE,
602+
DocItemLabel.FORMULA,
603+
DocItemLabel.PAGE_FOOTER,
604+
DocItemLabel.PAGE_HEADER,
605+
DocItemLabel.PARAGRAPH,
606+
DocItemLabel.REFERENCE,
607+
DocItemLabel.TEXT,
608+
DocItemLabel.TITLE,
609+
]
610+
596611
orig: str # untreated representation
597612
text: str # sanitized representation
598613

@@ -644,8 +659,10 @@ def export_to_document_tokens(
644659
class SectionHeaderItem(TextItem):
645660
"""SectionItem."""
646661

647-
label: typing.Literal[DocItemLabel.SECTION_HEADER] = DocItemLabel.SECTION_HEADER
648-
level: LevelNumber
662+
label: typing.Literal[DocItemLabel.SECTION_HEADER] = (
663+
DocItemLabel.SECTION_HEADER # type: ignore[assignment]
664+
)
665+
level: LevelNumber = 1
649666

650667
def export_to_document_tokens(
651668
self,
@@ -695,9 +712,11 @@ def export_to_document_tokens(
695712
class ListItem(TextItem):
696713
"""SectionItem."""
697714

698-
label: typing.Literal[DocItemLabel.LIST_ITEM] = DocItemLabel.LIST_ITEM
715+
label: typing.Literal[DocItemLabel.LIST_ITEM] = (
716+
DocItemLabel.LIST_ITEM # type: ignore[assignment]
717+
)
699718
enumerated: bool = False
700-
marker: str # The bullet or number symbol that prefixes this list item
719+
marker: str = "-" # The bullet or number symbol that prefixes this list item
701720

702721

703722
class FloatingItem(DocItem):
@@ -923,7 +942,10 @@ class TableItem(FloatingItem):
923942
"""TableItem."""
924943

925944
data: TableData
926-
label: typing.Literal[DocItemLabel.TABLE] = DocItemLabel.TABLE
945+
label: typing.Literal[
946+
DocItemLabel.DOCUMENT_INDEX,
947+
DocItemLabel.TABLE,
948+
] = DocItemLabel.TABLE
927949

928950
def export_to_dataframe(self) -> pd.DataFrame:
929951
"""Export the table as a Pandas DataFrame."""
@@ -1272,9 +1294,19 @@ def export_to_document_tokens(
12721294
class KeyValueItem(DocItem):
12731295
"""KeyValueItem."""
12741296

1297+
label: typing.Literal[DocItemLabel.KEY_VALUE_REGION] = DocItemLabel.KEY_VALUE_REGION
1298+
12751299

1276-
ContentItem = Union[
1277-
TextItem, SectionHeaderItem, ListItem, PictureItem, TableItem, KeyValueItem
1300+
ContentItem = Annotated[
1301+
Union[
1302+
TextItem,
1303+
SectionHeaderItem,
1304+
ListItem,
1305+
PictureItem,
1306+
TableItem,
1307+
KeyValueItem,
1308+
],
1309+
Field(discriminator="label"),
12781310
]
12791311

12801312

@@ -1505,14 +1537,15 @@ def add_table(
15051537
caption: Optional[Union[TextItem, RefItem]] = None, # This is not cool yet.
15061538
prov: Optional[ProvenanceItem] = None,
15071539
parent: Optional[GroupItem] = None,
1540+
label: DocItemLabel = DocItemLabel.TABLE,
15081541
):
15091542
"""add_table.
15101543
1511-
:param data: BaseTableData:
1512-
:param caption: Optional[Union[TextItem:
1513-
:param RefItem]]: (Default value = None)
1514-
:param # This is not cool yet.prov: Optional[ProvenanceItem]
1544+
:param data: TableData:
1545+
:param caption: Optional[Union[TextItem, RefItem]]: (Default value = None)
1546+
:param prov: Optional[ProvenanceItem]: (Default value = None)
15151547
:param parent: Optional[GroupItem]: (Default value = None)
1548+
:param label: DocItemLabel: (Default value = DocItemLabel.TABLE)
15161549
15171550
"""
15181551
if not parent:
@@ -1522,7 +1555,7 @@ def add_table(
15221555
cref = f"#/tables/{table_index}"
15231556

15241557
tbl_item = TableItem(
1525-
label=DocItemLabel.TABLE, data=data, self_ref=cref, parent=parent.get_ref()
1558+
label=label, data=data, self_ref=cref, parent=parent.get_ref()
15261559
)
15271560
if prov:
15281561
tbl_item.prov.append(prov)

docs/DoclingDocument.json

+29-35
Original file line numberDiff line numberDiff line change
@@ -171,32 +171,6 @@
171171
"title": "CoordOrigin",
172172
"type": "string"
173173
},
174-
"DocItemLabel": {
175-
"description": "DocItemLabel.",
176-
"enum": [
177-
"caption",
178-
"footnote",
179-
"formula",
180-
"list_item",
181-
"page_footer",
182-
"page_header",
183-
"picture",
184-
"section_header",
185-
"table",
186-
"text",
187-
"title",
188-
"document_index",
189-
"code",
190-
"checkbox_selected",
191-
"checkbox_unselected",
192-
"form",
193-
"key_value_region",
194-
"paragraph",
195-
"reference"
196-
],
197-
"title": "DocItemLabel",
198-
"type": "string"
199-
},
200174
"DocumentOrigin": {
201175
"description": "FileSource.",
202176
"properties": {
@@ -362,7 +336,10 @@
362336
"type": "array"
363337
},
364338
"label": {
365-
"$ref": "#/$defs/DocItemLabel"
339+
"const": "key_value_region",
340+
"default": "key_value_region",
341+
"title": "Label",
342+
"type": "string"
366343
},
367344
"prov": {
368345
"default": [],
@@ -374,8 +351,7 @@
374351
}
375352
},
376353
"required": [
377-
"self_ref",
378-
"label"
354+
"self_ref"
379355
],
380356
"title": "KeyValueItem",
381357
"type": "object"
@@ -436,15 +412,15 @@
436412
"type": "boolean"
437413
},
438414
"marker": {
415+
"default": "-",
439416
"title": "Marker",
440417
"type": "string"
441418
}
442419
},
443420
"required": [
444421
"self_ref",
445422
"orig",
446-
"text",
447-
"marker"
423+
"text"
448424
],
449425
"title": "ListItem",
450426
"type": "object"
@@ -1036,6 +1012,7 @@
10361012
"type": "string"
10371013
},
10381014
"level": {
1015+
"default": 1,
10391016
"maximum": 100,
10401017
"minimum": 1,
10411018
"title": "Level",
@@ -1045,8 +1022,7 @@
10451022
"required": [
10461023
"self_ref",
10471024
"orig",
1048-
"text",
1049-
"level"
1025+
"text"
10501026
],
10511027
"title": "SectionHeaderItem",
10521028
"type": "object"
@@ -1192,8 +1168,11 @@
11921168
"type": "array"
11931169
},
11941170
"label": {
1195-
"const": "table",
11961171
"default": "table",
1172+
"enum": [
1173+
"document_index",
1174+
"table"
1175+
],
11971176
"title": "Label",
11981177
"type": "string"
11991178
},
@@ -1280,7 +1259,22 @@
12801259
"type": "array"
12811260
},
12821261
"label": {
1283-
"$ref": "#/$defs/DocItemLabel"
1262+
"enum": [
1263+
"caption",
1264+
"checkbox_selected",
1265+
"checkbox_unselected",
1266+
"code",
1267+
"footnote",
1268+
"formula",
1269+
"page_footer",
1270+
"page_header",
1271+
"paragraph",
1272+
"reference",
1273+
"text",
1274+
"title"
1275+
],
1276+
"title": "Label",
1277+
"type": "string"
12841278
},
12851279
"prov": {
12861280
"default": [],

0 commit comments

Comments
 (0)