docling-project · vagenas · Dec 13, 2024 · Dec 12, 2024 · Dec 12, 2024 · Dec 12, 2024
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
@@ -593,6 +593,21 @@ def get_image(self, doc: "DoclingDocument") -> Optional[PILImage.Image]:
 class TextItem(DocItem):
     """TextItem."""
 
+    label: typing.Literal[
+        DocItemLabel.CAPTION,
+        DocItemLabel.CHECKBOX_SELECTED,
+        DocItemLabel.CHECKBOX_UNSELECTED,
+        DocItemLabel.CODE,
+        DocItemLabel.FOOTNOTE,
+        DocItemLabel.FORMULA,
+        DocItemLabel.PAGE_FOOTER,
+        DocItemLabel.PAGE_HEADER,
+        DocItemLabel.PARAGRAPH,
+        DocItemLabel.REFERENCE,
+        DocItemLabel.TEXT,
+        DocItemLabel.TITLE,
+    ]
+
     orig: str  # untreated representation
     text: str  # sanitized representation
 
@@ -644,8 +659,10 @@ def export_to_document_tokens(
 class SectionHeaderItem(TextItem):
     """SectionItem."""
 
-    label: typing.Literal[DocItemLabel.SECTION_HEADER] = DocItemLabel.SECTION_HEADER
-    level: LevelNumber
+    label: typing.Literal[DocItemLabel.SECTION_HEADER] = (
+        DocItemLabel.SECTION_HEADER  # type: ignore[assignment]
+    )
+    level: LevelNumber = 1
 
     def export_to_document_tokens(
         self,
@@ -695,9 +712,11 @@ def export_to_document_tokens(
 class ListItem(TextItem):
     """SectionItem."""
 
-    label: typing.Literal[DocItemLabel.LIST_ITEM] = DocItemLabel.LIST_ITEM
+    label: typing.Literal[DocItemLabel.LIST_ITEM] = (
+        DocItemLabel.LIST_ITEM  # type: ignore[assignment]
+    )
     enumerated: bool = False
-    marker: str  # The bullet or number symbol that prefixes this list item
+    marker: str = "-"  # The bullet or number symbol that prefixes this list item
 
 
 class FloatingItem(DocItem):
@@ -923,7 +942,10 @@ class TableItem(FloatingItem):
     """TableItem."""
 
     data: TableData
-    label: typing.Literal[DocItemLabel.TABLE] = DocItemLabel.TABLE
+    label: typing.Literal[
+        DocItemLabel.DOCUMENT_INDEX,
+        DocItemLabel.TABLE,
+    ] = DocItemLabel.TABLE
 
     def export_to_dataframe(self) -> pd.DataFrame:
         """Export the table as a Pandas DataFrame."""
@@ -1272,9 +1294,25 @@ def export_to_document_tokens(
 class KeyValueItem(DocItem):
     """KeyValueItem."""
 
+    label: typing.Literal[DocItemLabel.KEY_VALUE_REGION] = DocItemLabel.KEY_VALUE_REGION
+
 
-ContentItem = Union[
-    TextItem, SectionHeaderItem, ListItem, PictureItem, TableItem, KeyValueItem
+class FormItem(DocItem):
+    """FormItem."""
+
+    label: typing.Literal[DocItemLabel.FORM] = DocItemLabel.FORM
+
+
+ContentItem = Annotated[
+    Union[
+        TextItem,
+        SectionHeaderItem,
+        ListItem,
+        PictureItem,
+        TableItem,
+        KeyValueItem,
+    ],
+    Field(discriminator="label"),
 ]
 
 
@@ -1505,6 +1543,7 @@ def add_table(
         caption: Optional[Union[TextItem, RefItem]] = None,  # This is not cool yet.
         prov: Optional[ProvenanceItem] = None,
         parent: Optional[GroupItem] = None,
+        label: DocItemLabel = DocItemLabel.TABLE,
     ):
         """add_table.
 
@@ -1522,7 +1561,7 @@ def add_table(
         cref = f"#/tables/{table_index}"
 
         tbl_item = TableItem(
-            label=DocItemLabel.TABLE, data=data, self_ref=cref, parent=parent.get_ref()
+            label=label, data=data, self_ref=cref, parent=parent.get_ref()
         )
         if prov:
             tbl_item.prov.append(prov)

diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json
@@ -171,32 +171,6 @@
       "title": "CoordOrigin",
       "type": "string"
     },
-    "DocItemLabel": {
-      "description": "DocItemLabel.",
-      "enum": [
-        "caption",
-        "footnote",
-        "formula",
-        "list_item",
-        "page_footer",
-        "page_header",
-        "picture",
-        "section_header",
-        "table",
-        "text",
-        "title",
-        "document_index",
-        "code",
-        "checkbox_selected",
-        "checkbox_unselected",
-        "form",
-        "key_value_region",
-        "paragraph",
-        "reference"
-      ],
-      "title": "DocItemLabel",
-      "type": "string"
-    },
     "DocumentOrigin": {
       "description": "FileSource.",
       "properties": {
@@ -362,7 +336,10 @@
           "type": "array"
         },
         "label": {
-          "$ref": "#/$defs/DocItemLabel"
+          "const": "key_value_region",
+          "default": "key_value_region",
+          "title": "Label",
+          "type": "string"
         },
         "prov": {
           "default": [],
@@ -374,8 +351,7 @@
         }
       },
       "required": [
-        "self_ref",
-        "label"
+        "self_ref"
       ],
       "title": "KeyValueItem",
       "type": "object"
@@ -436,15 +412,15 @@
           "type": "boolean"
         },
         "marker": {
+          "default": "-",
           "title": "Marker",
           "type": "string"
         }
       },
       "required": [
         "self_ref",
         "orig",
-        "text",
-        "marker"
+        "text"
       ],
       "title": "ListItem",
       "type": "object"
@@ -1036,6 +1012,7 @@
           "type": "string"
         },
         "level": {
+          "default": 1,
           "maximum": 100,
           "minimum": 1,
           "title": "Level",
@@ -1045,8 +1022,7 @@
       "required": [
         "self_ref",
         "orig",
-        "text",
-        "level"
+        "text"
       ],
       "title": "SectionHeaderItem",
       "type": "object"
@@ -1192,8 +1168,11 @@
           "type": "array"
         },
         "label": {
-          "const": "table",
           "default": "table",
+          "enum": [
+            "document_index",
+            "table"
+          ],
           "title": "Label",
           "type": "string"
         },
@@ -1280,7 +1259,22 @@
           "type": "array"
         },
         "label": {
-          "$ref": "#/$defs/DocItemLabel"
+          "enum": [
+            "caption",
+            "checkbox_selected",
+            "checkbox_unselected",
+            "code",
+            "footnote",
+            "formula",
+            "page_footer",
+            "page_header",
+            "paragraph",
+            "reference",
+            "text",
+            "title"
+          ],
+          "title": "Label",
+          "type": "string"
         },
         "prov": {
           "default": [],