From 334e1b7f212fdac84b10fea54140127ddf90ae4c Mon Sep 17 00:00:00 2001
From: Christoph Auer <cau@zurich.ibm.com>
Date: Tue, 17 Sep 2024 15:13:32 +0200
Subject: [PATCH] Draft new docling document format, pydantic model and tests

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---
 docling_core/types/newdoc/__init__.py |   0
 docling_core/types/newdoc/base.py     | 119 ++++++++++++++++++
 docling_core/types/newdoc/document.py |  82 +++++++++++++
 test/data/newdoc/dummy_doc.yaml       | 169 ++++++++++++++++++++++++++
 test/test_newdoc.py                   |  33 +++++
 5 files changed, 403 insertions(+)
 create mode 100644 docling_core/types/newdoc/__init__.py
 create mode 100644 docling_core/types/newdoc/base.py
 create mode 100644 docling_core/types/newdoc/document.py
 create mode 100644 test/data/newdoc/dummy_doc.yaml
 create mode 100644 test/test_newdoc.py

diff --git a/docling_core/types/newdoc/__init__.py b/docling_core/types/newdoc/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/docling_core/types/newdoc/base.py b/docling_core/types/newdoc/base.py
new file mode 100644
index 00000000..c30dfd04
--- /dev/null
+++ b/docling_core/types/newdoc/base.py
@@ -0,0 +1,119 @@
+import copy
+import enum
+from typing import Tuple
+
+from pydantic import BaseModel
+
+
+## All copied from docling
+class CoordOrigin(enum.StrEnum):  # origin convention of BoundingBox coordinates; NOTE(review): StrEnum requires Python >= 3.11
+    TOPLEFT = "TOPLEFT"  # BoundingBox.from_tuple orders edges so that t <= b
+    BOTTOMLEFT = "BOTTOMLEFT"  # BoundingBox.from_tuple orders edges so that b <= t
+
+class Size(BaseModel):  # width/height pair, e.g. the page_size passed to BoundingBox.normalized
+    width: float = 0.0  # NOTE: normalized() divides by this, so the 0.0 default raises ZeroDivisionError there
+    height: float = 0.0  # same caveat as width
+
+
+class BoundingBox(BaseModel):
+    """Axis-aligned box stored as left/top/right/bottom edge coordinates."""
+    l: float  # left
+    t: float  # top
+    r: float  # right
+    b: float  # bottom
+
+    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
+
+    @property
+    def width(self):
+        """Horizontal extent, ``r - l``."""
+        return self.r - self.l
+
+    @property
+    def height(self):
+        """Vertical extent; ``abs`` keeps it non-negative for either origin."""
+        return abs(self.t - self.b)
+
+    def scaled(self, scale: float) -> "BoundingBox":
+        """Return a copy with all four edges multiplied by ``scale``."""
+        out_bbox = copy.deepcopy(self)
+        out_bbox.l *= scale
+        out_bbox.r *= scale
+        out_bbox.t *= scale
+        out_bbox.b *= scale
+        return out_bbox
+
+    def normalized(self, page_size: Size) -> "BoundingBox":
+        """Return a copy with edges divided by the page width/height."""
+        out_bbox = copy.deepcopy(self)
+        out_bbox.l /= page_size.width
+        out_bbox.r /= page_size.width
+        out_bbox.t /= page_size.height
+        out_bbox.b /= page_size.height
+        return out_bbox
+
+    def as_tuple(self):
+        """Return (l, t, r, b) for TOPLEFT or (l, b, r, t) for BOTTOMLEFT."""
+        if self.coord_origin == CoordOrigin.TOPLEFT:
+            return (self.l, self.t, self.r, self.b)
+        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return (self.l, self.b, self.r, self.t)
+
+    @classmethod
+    def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
+        """Build a box from an ``as_tuple``-style 4-tuple, reordering swapped edges."""
+        if origin == CoordOrigin.TOPLEFT:
+            l, t, r, b = coord[0], coord[1], coord[2], coord[3]
+            if r < l:
+                l, r = r, l
+            if b < t:
+                b, t = t, b
+            return cls(l=l, t=t, r=r, b=b, coord_origin=origin)
+        elif origin == CoordOrigin.BOTTOMLEFT:
+            l, b, r, t = coord[0], coord[1], coord[2], coord[3]
+            if r < l:
+                l, r = r, l
+            if b > t:
+                b, t = t, b
+            return cls(l=l, t=t, r=r, b=b, coord_origin=origin)
+
+    def area(self) -> float:
+        """Return width * height; non-negative even when b < t (BOTTOMLEFT)."""
+        return self.width * self.height
+
+    def intersection_area_with(self, other: "BoundingBox") -> float:
+        """Return the overlap area with ``other`` (0.0 if none); same origin assumed."""
+        width = min(self.r, other.r) - max(self.l, other.l)
+        # Sort each (t, b) pair so the vertical overlap works for either origin.
+        self_lo, self_hi = sorted((self.t, self.b))
+        other_lo, other_hi = sorted((other.t, other.b))
+        height = min(self_hi, other_hi) - max(self_lo, other_lo)
+        if width <= 0 or height <= 0:
+            return 0.0
+        return width * height
+
+    def to_bottom_left_origin(self, page_height) -> "BoundingBox":
+        """Return this box in BOTTOMLEFT coordinates (``self`` if already so)."""
+        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return self
+        elif self.coord_origin == CoordOrigin.TOPLEFT:
+            return BoundingBox(
+                l=self.l,
+                r=self.r,
+                t=page_height - self.t,
+                b=page_height - self.b,
+                coord_origin=CoordOrigin.BOTTOMLEFT,
+            )
+
+    def to_top_left_origin(self, page_height) -> "BoundingBox":
+        """Return this box in TOPLEFT coordinates (``self`` if already so)."""
+        if self.coord_origin == CoordOrigin.TOPLEFT:
+            return self
+        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return BoundingBox(
+                l=self.l,
+                r=self.r,
+                t=page_height - self.t,  # self.b
+                b=page_height - self.b,  # self.t
+                coord_origin=CoordOrigin.TOPLEFT,
+            )
diff --git a/docling_core/types/newdoc/document.py b/docling_core/types/newdoc/document.py
new file mode 100644
index 00000000..dbcdc714
--- /dev/null
+++ b/docling_core/types/newdoc/document.py
@@ -0,0 +1,82 @@
+from typing import Any, List, Dict, Optional, Tuple, Union
+
+from pydantic import BaseModel, Field, AnyUrl
+
+from docling_core.types.newdoc.base import Size, BoundingBox
+
+class FigureData(BaseModel): # TBD: placeholder without fields; it is the declared type of FigureItem.data
+    pass
+
+class TableData(BaseModel): # TBD: placeholder without fields; it is the declared type of TableItem.data
+    pass
+
+class RefItem(BaseModel):
+    cref: str = Field(alias="$ref")  # reference path such as "/texts/0"
+
+    def resolve(self, doc: "DoclingDocument"):
+        """Return the item of ``doc`` addressed by this "/<list>/<index>" path."""
+        # The path must split into exactly three parts; the first is ignored.
+        _, list_name, position = self.cref.split("/")
+        return getattr(doc, list_name)[int(position)]
+
+class ImageRef(BaseModel):  # describes an image by format, dpi, size and location
+    format: str # png, etc.
+    dpi: int # presumably dots per inch (72 and 144 in the dummy doc) — confirm
+    size: Size # width/height; unit not stated here, presumably pixels — confirm
+    uri: AnyUrl # the dummy doc uses file:// URLs here
+
+
+class ProvenanceItem(BaseModel):  # entry of DocItem.prov: page number, bounding box and character span
+    page_no: int
+    bbox: BoundingBox
+    charspan: Tuple[int, int]  # 2-tuple; per the dummy doc it references the item's "orig" text
+
+
+class DocItem(BaseModel):  # common base of the item classes defined below
+    dloc: str # format spec ({document_hash}{json-path})
+    hash: int  # the dummy doc describes it as "uint64 hash of dloc"
+    label: str  # e.g. "title", "caption", "table" in the dummy doc
+    parent: Optional[RefItem]  # no default: the key is required but may be null
+    children: List[RefItem]
+    prov: List[ProvenanceItem]  # per the dummy doc: must exist, can be empty
+
+class TextItem(DocItem):  # DocItem that carries text in an untreated and a sanitized form
+    orig: str # untreated representation
+    text: str # sanitized representation
+
+class FloatingItem(DocItem):  # NOTE(review): no class in this file derives from or references this one — confirm whether FigureItem/TableItem should
+    caption: Optional[Union[RefItem, TextItem]]  # either a reference or an inline TextItem; required key, may be null
+    references: List[Union[RefItem, TextItem]]
+    footnotes: List[Union[RefItem, TextItem]]
+    data: Any
+    image: Optional[ImageRef]  # required key, may be null
+
+class FigureItem(DocItem):  # NOTE(review): derives from DocItem, so the caption/references/footnotes/image keys used in dummy_doc.yaml are not declared here — confirm intent
+    data: FigureData
+
+class TableItem(DocItem):  # NOTE(review): derives from DocItem, so the caption/references/footnotes/image keys used in dummy_doc.yaml are not declared here — confirm intent
+    data: TableData
+
+class KeyValueItem(DocItem):  # declares no fields beyond those of DocItem
+    pass
+
+ContentItem = Union[TextItem, FigureItem, TableItem, KeyValueItem]  # alias over the concrete item types; not referenced elsewhere in this file
+
+class DocumentContent(BaseModel):  # item lists shared by PageItem and DoclingDocument
+    furniture: List[RefItem] = []  # by-reference only; dummy doc: headers, footers, other non-body text
+    body: List[RefItem] = []  # by-reference only; entries point into the lists below
+    texts: List[TextItem] = []  # item data; RefItem paths such as "/texts/0" index into these lists
+    figures: List[FigureItem] = []
+    tables: List[TableItem] = []
+    key_value_items: List[KeyValueItem] = []
+
+class PageItem(DocumentContent):  # value type of DoclingDocument.pages; inherits the content lists
+    hash: str # page hash
+    size: Size  # page size; unit not stated here
+    image: Optional[ImageRef]  # no default: the key is required but may be null
+    num_elements: int
+
+class DoclingDocument(DocumentContent):  # top-level model; test_newdoc.py validates dummy_doc.yaml against it
+    description: Any  # "DescriptionType - TBD" per the dummy doc
+    file_info: Any  # "FileInfoType - TBD" per the dummy doc
+    pages: Dict[int, PageItem] = {} # empty as default
diff --git a/test/data/newdoc/dummy_doc.yaml b/test/data/newdoc/dummy_doc.yaml
new file mode 100644
index 00000000..f092eb81
--- /dev/null
+++ b/test/data/newdoc/dummy_doc.yaml
@@ -0,0 +1,169 @@
+---
+## Document with content + layout info
+description: { } # DescriptionType - TBD
+file_info: # FileInfoType - TBD
+  document_hash: e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5
+furniture: # Headers, footers, framing, navigation elements, all other non-body text
+  - $ref: "/texts/0"
+
+body: # All elements in other arrays, by-reference only
+  - $ref: "/texts/1"
+  - $ref: "/figures/0"
+  - $ref: "/texts/2"
+  - $ref: "/texts/3"
+  - $ref: "/tables/0"
+
+texts: # All elements that have a text-string representation, with actual data
+  - orig: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
+    text: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
+    dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/0"
+    hash: 132103230
+    label: "page_header"
+    parent: null
+    children: [ ]
+    prov:
+      - page_no: 1
+        bbox:
+          l: 21.3
+          t: 52.3
+          b: 476.2
+          r: 35.2
+        charspan: [ 1,423 ] # 2-tuple, references to "orig"
+  - orig: "DocLayNet: A Large Human-Annotated Dataset for\nDocument-Layout Analysis"
+    text: "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"
+    dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/1"
+    hash: 2349732 # uint64 hash of dloc
+    label: "title"
+    parent: null
+    children: [ ]
+    prov: # must exist, can be empty
+      - page_no: 1
+        bbox:
+          l: 65.0
+          t: 30.1
+          b: 53.4
+          r: 623.2
+        charspan: [ 1,423 ] # 2-tuple, references to "orig"
+  - orig: "OPERATION (cont.)" # nested inside the figure
+    text: "OPERATION (cont.)"
+    dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/2"
+    hash: 6978483
+    label: "section_header"
+    parent:
+      $ref: "/figures/0"
+    children: [ ]
+    prov:
+      - page_no: 1
+        bbox:
+          l: 323.0
+          t: 354.3
+          b: 334.4
+          r: 376.0
+        charspan: [ 0,734 ]
+  - orig: "Figure 1: Four examples of complex page layouts across dif-\nferent document categories" # nested inside the figure
+    text: "Figure 1: Four examples of complex page layouts across different document categories"
+    dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/3"
+    hash: 6978483
+    label: "caption"
+    parent:
+      $ref: "/figures/0"
+    children: [ ]
+    prov:
+      - page_no: 1
+        bbox:
+          l: 323.0
+          t: 354.3
+          b: 334.4
+          r: 376.0
+          coord_origin: "BOTTOMLEFT"
+        charspan: [ 1,423 ] # 2-tuple, references to "orig"
+
+
+tables: # All tables...
+  - dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/tables/0"
+    hash: 98574
+    label: "table"
+    parent: null
+    children: [ ]
+    caption:
+      $ref: "/texts/3"
+    references:
+      - $ref: "/text/??"
+    footnotes:
+      - $ref: "/text/??"
+    image:
+      format: png
+      dpi: 72
+      size:
+        width: 231
+        height: 351
+      uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/tables/0.png"
+      #alternatives: base64 encoded string
+    data: # TableData Type
+      grid: [ [ ] ] # list-of-list of TableCell type
+      otsl: "<fcel><ecel>..." # OTSL token string
+      html: "" # ??
+    prov:
+      - page_no: 1
+        bbox:
+          l: 323.0
+          t: 354.3
+          b: 334.4
+          r: 376.0
+          coord_origin: "BOTTOMLEFT"
+        charspan: [ 1,423 ] # 2-tuple, references to "orig"
+
+figures: # All figures...
+  - dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/figures/0"
+    hash: 7782482
+    label: "figure"
+    parent: null
+    caption:
+      $ref: "/texts/2"
+    references:
+      - $ref: "/text/??"
+    footnotes:
+      - $ref: "/text/??"
+
+    data: # FigureData Type
+      classification: "illustration"
+      confidence: 0.78
+      description: "...."
+      # content structure?
+    image:
+      format: png
+      dpi: 72
+      size:
+        width: 231
+        height: 351
+      uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/figures/0.png"
+      #alternatives: base64 encoded string
+    children:
+      - $ref: "/texts/2"
+    prov:
+      - page_no: 1
+        bbox:
+          l: 456.3
+          t: 145.8
+          b: 623.4
+          r: 702.5
+        charspan: [ 0,288 ]
+
+key_value_items: [ ] # All KV-items
+
+# We should consider this for pages
+pages: # Optional, for layout documents
+  1:
+    hash: "5b0916ed3ead46e69efcddb2c932afd91d0e25ce6828c39e5617e6ee2bd0cf6e"
+    size:
+      width: 768.23
+      height: 583.15
+    image:
+      format: png
+      dpi: 144
+      size:
+        width: 1536
+        height: 1166
+      uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/pages/1.png"
+      #alternatives: base64 encoded string
+    num_elements: 23
\ No newline at end of file
diff --git a/test/test_newdoc.py b/test/test_newdoc.py
new file mode 100644
index 00000000..591abc6c
--- /dev/null
+++ b/test/test_newdoc.py
@@ -0,0 +1,33 @@
+import yaml
+from docling_core.types.newdoc.document import DoclingDocument
+
+def test_newdoc():
+    """Load the dummy document, resolve references and round-trip via YAML."""
+    from pathlib import Path
+    # Locate the fixture relative to this file so the cwd does not matter.
+    yaml_path = Path(__file__).parent / "data" / "newdoc" / "dummy_doc.yaml"
+    with open(yaml_path, "r") as fp:
+        dict_from_yaml = yaml.safe_load(fp)
+    doc = DoclingDocument.model_validate(dict_from_yaml)
+
+    # Objects and their members can be accessed directly
+    text_item = doc.texts[0]
+    assert text_item.text == "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
+    assert text_item.prov[0].page_no == 1
+
+    # Objects that are references need explicit resolution for now:
+    obj = doc.body[2].resolve(doc=doc)  # Text item with parent
+    parent = obj.parent.resolve(doc=doc)  # it is a figure
+    obj2 = parent.children[0].resolve(doc=doc)  # Child of figure must be the same as obj
+    assert obj == obj2
+    assert obj is obj2
+
+    # A dump with aliases ("$ref") must validate back into an equal document
+    doc_dumped = doc.model_dump(mode="json", by_alias=True)
+    doc_reload = DoclingDocument.model_validate(yaml.safe_load(yaml.safe_dump(doc_dumped)))
+    assert doc_reload == doc  # must be equal
+    assert doc_reload is not doc  # can't be identical
+
+
+if __name__ == "__main__":
+    test_newdoc()