From 334e1b7f212fdac84b10fea54140127ddf90ae4c Mon Sep 17 00:00:00 2001
From: Christoph Auer
Date: Tue, 17 Sep 2024 15:13:32 +0200
Subject: [PATCH] Draft new docling document format, pydantic model and tests

Signed-off-by: Christoph Auer
---
 docling_core/types/newdoc/__init__.py |   0
 docling_core/types/newdoc/base.py     | 155 +++++++++++++++++++++++
 docling_core/types/newdoc/document.py | 124 +++++++++++++++++++
 test/data/newdoc/dummy_doc.yaml       | 169 ++++++++++++++++++++++++++
 test/test_newdoc.py                   |  47 +++++++
 5 files changed, 495 insertions(+)
 create mode 100644 docling_core/types/newdoc/__init__.py
 create mode 100644 docling_core/types/newdoc/base.py
 create mode 100644 docling_core/types/newdoc/document.py
 create mode 100644 test/data/newdoc/dummy_doc.yaml
 create mode 100644 test/test_newdoc.py

diff --git a/docling_core/types/newdoc/__init__.py b/docling_core/types/newdoc/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/docling_core/types/newdoc/base.py b/docling_core/types/newdoc/base.py
new file mode 100644
index 00000000..c30dfd04
--- /dev/null
+++ b/docling_core/types/newdoc/base.py
@@ -0,0 +1,155 @@
+"""Geometric base types for the draft docling document format."""
+
+import copy
+import enum
+from typing import Tuple
+
+from pydantic import BaseModel
+
+
+## All copied from docling
+class CoordOrigin(enum.StrEnum):
+    """Corner of the page that bounding-box coordinates are measured from."""
+
+    TOPLEFT = "TOPLEFT"
+    BOTTOMLEFT = "BOTTOMLEFT"
+
+
+class Size(BaseModel):
+    """Width and height of a page or image."""
+
+    width: float = 0.0
+    height: float = 0.0
+
+
+class BoundingBox(BaseModel):
+    """Axis-aligned rectangle, interpreted relative to ``coord_origin``."""
+
+    l: float  # left
+    t: float  # top
+    r: float  # right
+    b: float  # bottom
+
+    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT
+
+    @property
+    def width(self):
+        """Horizontal extent (``r - l``)."""
+        return self.r - self.l
+
+    @property
+    def height(self):
+        """Vertical extent, non-negative for either coordinate origin."""
+        return abs(self.t - self.b)
+
+    def scaled(self, scale: float) -> "BoundingBox":
+        """Return a copy with all four coordinates multiplied by ``scale``."""
+        out_bbox = copy.deepcopy(self)
+        out_bbox.l *= scale
+        out_bbox.r *= scale
+        out_bbox.t *= scale
+        out_bbox.b *= scale
+
+        return out_bbox
+
+    def normalized(self, page_size: Size) -> "BoundingBox":
+        """Return a copy with coordinates divided by the page dimensions.
+
+        Raises ZeroDivisionError if ``page_size`` has a zero width or height.
+        """
+        out_bbox = copy.deepcopy(self)
+        out_bbox.l /= page_size.width
+        out_bbox.r /= page_size.width
+        out_bbox.t /= page_size.height
+        out_bbox.b /= page_size.height
+
+        return out_bbox
+
+    def as_tuple(self):
+        """Return the coordinates as a 4-tuple.
+
+        The order is (l, t, r, b) for TOPLEFT and (l, b, r, t) for BOTTOMLEFT.
+        """
+        if self.coord_origin == CoordOrigin.TOPLEFT:
+            return (self.l, self.t, self.r, self.b)
+        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return (self.l, self.b, self.r, self.t)
+
+    @classmethod
+    def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
+        """Build a box from a 4-tuple laid out as ``as_tuple`` returns it.
+
+        Swapped edges are reordered so that l <= r and the top/bottom pair is
+        consistent with ``origin``.
+        """
+        if origin == CoordOrigin.TOPLEFT:
+            l, t, r, b = coord[0], coord[1], coord[2], coord[3]
+            if r < l:
+                l, r = r, l
+            if b < t:
+                b, t = t, b
+
+            return cls(l=l, t=t, r=r, b=b, coord_origin=origin)
+        elif origin == CoordOrigin.BOTTOMLEFT:
+            l, b, r, t = coord[0], coord[1], coord[2], coord[3]
+            if r < l:
+                l, r = r, l
+            if b > t:
+                b, t = t, b
+
+            return cls(l=l, t=t, r=r, b=b, coord_origin=origin)
+
+    def area(self) -> float:
+        """Return the area of the box, independent of the coordinate origin."""
+        # self.b - self.t is negative for BOTTOMLEFT boxes (t > b), so use the
+        # origin-independent width/height properties instead.
+        return self.width * self.height
+
+    def intersection_area_with(self, other: "BoundingBox") -> float:
+        """Return the overlap area with ``other``, or 0.0 if they are disjoint.
+
+        Both boxes are assumed to use the same coordinate origin.
+        """
+        # Calculate intersection coordinates
+        left = max(self.l, other.l)
+        right = min(self.r, other.r)
+        # Compare vertical extents as (low, high) intervals so the result is
+        # also correct for BOTTOMLEFT boxes, where t > b.
+        low = max(min(self.t, self.b), min(other.t, other.b))
+        high = min(max(self.t, self.b), max(other.t, other.b))
+
+        # Calculate intersection dimensions
+        width = right - left
+        height = high - low
+
+        # If the bounding boxes do not overlap, width or height will be negative
+        if width <= 0 or height <= 0:
+            return 0.0
+
+        return width * height
+
+    def to_bottom_left_origin(self, page_height) -> "BoundingBox":
+        """Return this box expressed with a bottom-left origin."""
+        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return self
+        elif self.coord_origin == CoordOrigin.TOPLEFT:
+            return BoundingBox(
+                l=self.l,
+                r=self.r,
+                t=page_height - self.t,
+                b=page_height - self.b,
+                coord_origin=CoordOrigin.BOTTOMLEFT,
+            )
+
+    def to_top_left_origin(self, page_height):
+        """Return this box expressed with a top-left origin."""
+        if self.coord_origin == CoordOrigin.TOPLEFT:
+            return self
+        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            return BoundingBox(
+                l=self.l,
+                r=self.r,
+                t=page_height - self.t,  # self.b
+                b=page_height - self.b,  # self.t
+                coord_origin=CoordOrigin.TOPLEFT,
+            )
diff --git a/docling_core/types/newdoc/document.py b/docling_core/types/newdoc/document.py
new file mode 100644
index 00000000..dbcdc714
--- /dev/null
+++ b/docling_core/types/newdoc/document.py
@@ -0,0 +1,124 @@
+"""Draft pydantic model of the new docling document format."""
+
+from typing import Any, List, Dict, Optional, Tuple, Union
+
+from pydantic import BaseModel, Field, AnyUrl
+
+from docling_core.types.newdoc.base import Size, BoundingBox
+
+
+class FigureData(BaseModel):  # TBD
+    """Placeholder for figure payload data."""
+
+
+class TableData(BaseModel):  # TBD
+    """Placeholder for table payload data."""
+
+
+class RefItem(BaseModel):
+    """By-reference pointer to an item, e.g. ``{"$ref": "/texts/0"}``."""
+
+    cref: str = Field(alias="$ref")
+
+    def resolve(self, doc: "DoclingDocument"):
+        """Return the item of ``doc`` that this reference points to.
+
+        The reference must look like ``/<collection>/<index>``.
+        """
+        _, path, index = self.cref.split("/")
+        index = int(index)
+        obj = getattr(doc, path)[index]
+        return obj
+
+
+class ImageRef(BaseModel):
+    """Reference to a rendered image."""
+
+    format: str  # png, etc.
+    dpi: int  # ...
+    size: Size
+    uri: AnyUrl
+
+
+class ProvenanceItem(BaseModel):
+    """Location of an item on a page."""
+
+    page_no: int
+    bbox: BoundingBox
+    charspan: Tuple[int, int]
+
+
+class DocItem(BaseModel):
+    """Fields shared by all document items."""
+
+    dloc: str  # format spec ({document_hash}{json-path})
+    hash: int
+    label: str
+    parent: Optional[RefItem]
+    children: List[RefItem]
+    prov: List[ProvenanceItem]
+
+
+class TextItem(DocItem):
+    """Item with a text-string representation."""
+
+    orig: str  # untreated representation
+    text: str  # sanitized representation
+
+
+class FloatingItem(DocItem):
+    """Item that can carry a caption, references, footnotes and an image."""
+
+    # Defaults keep items valid when a floating field is absent.
+    caption: Optional[Union[RefItem, TextItem]] = None
+    references: List[Union[RefItem, TextItem]] = []
+    footnotes: List[Union[RefItem, TextItem]] = []
+    data: Any = None
+    image: Optional[ImageRef] = None
+
+
+class FigureItem(FloatingItem):
+    """Figure; inherits the floating fields so they are not dropped on load."""
+
+    data: FigureData
+
+
+class TableItem(FloatingItem):
+    """Table; inherits the floating fields so they are not dropped on load."""
+
+    data: TableData
+
+
+class KeyValueItem(DocItem):
+    """Key-value item (no extra fields yet)."""
+
+
+ContentItem = Union[TextItem, FigureItem, TableItem, KeyValueItem]
+
+
+class DocumentContent(BaseModel):
+    """By-reference layout lists plus the item collections they point into."""
+
+    furniture: List[RefItem] = []
+    body: List[RefItem] = []
+    texts: List[TextItem] = []
+    figures: List[FigureItem] = []
+    tables: List[TableItem] = []
+    key_value_items: List[KeyValueItem] = []
+
+
+class PageItem(DocumentContent):
+    """Per-page content together with page size and image."""
+
+    hash: str  # page hash
+    size: Size
+    image: Optional[ImageRef]
+    num_elements: int
+
+
+class DoclingDocument(DocumentContent):
+    """Top-level document model."""
+
+    description: Any
+    file_info: Any
+    pages: Dict[int, PageItem] = {}  # empty as default
diff --git a/test/data/newdoc/dummy_doc.yaml b/test/data/newdoc/dummy_doc.yaml
new file mode 100644
index 00000000..f092eb81
--- /dev/null
+++ b/test/data/newdoc/dummy_doc.yaml
@@ -0,0 +1,169 @@
+---
+## Document with content + layout info
+description: { } # DescriptionType - TBD
+file_info: # FileInfoType - TBD
+  document_hash: e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5
+furniture: # Headers, footers, framing, navigation elements, all other non-body text
+  - $ref: "/texts/0"
+
+body: # All elements in other arrays, by-reference only
+  - $ref: "/texts/1"
+  - $ref: "/figures/0"
+  - $ref: "/texts/2"
+  - $ref: "/texts/3"
+  - $ref: "/tables/0"
+
+texts: # All elements that have a text-string representation, with actual data
+  - orig: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
+    text: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
+    dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/0"
+    hash: 132103230
+    label: "page_header"
+    parent: null
+    children: [ ]
+    prov:
+      - page_no: 1
+        bbox:
+          l: 21.3
+          t: 52.3
+          b: 476.2
+          r: 35.2
+        charspan: [ 1,423 ] # 2-tuple, references to "orig"
+  - orig: "DocLayNet: A Large Human-Annotated Dataset for\nDocument-Layout Analysis"
+    text: "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"
+    dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/1"
+    hash: 2349732 # uint64 hash of dloc
+    label: "title"
+    parent: null
+    children: [ ]
+    prov: # must exist, can be empty
+      - page_no: 1
+        bbox:
+          l: 65.0
+          t: 30.1
+          b: 53.4
+          r: 623.2
+        charspan: [ 1,423 ] # 2-tuple, references to "orig"
+  - orig: "OPERATION (cont.)" # nested inside the figure
+    text: "OPERATION (cont.)"
+    dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/2"
+    hash: 6978483
+    label: "section_header"
+    parent:
+      $ref: "/figures/0"
+    children: [ ]
+    prov:
+      - page_no: 1
+        bbox:
+          l: 323.0
+          t: 354.3
+          b: 334.4
+          r: 376.0
+        charspan: [ 0,734 ]
+  - orig: "Figure 1: Four examples of complex page layouts across dif-\nferent document categories" # nested inside the figure
+    text: "Figure 1: Four examples of complex page layouts across different document categories"
+    dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/3"
+    hash: 6978483
+    label: "caption"
+    parent:
+      $ref: "/figures/0"
+    children: [ ]
+    prov:
+      - page_no: 1
+        bbox:
+          l: 323.0
+          t: 354.3
+          b: 334.4
+          r: 376.0
+          coord_origin: "BOTTOMLEFT"
+        charspan: [ 1,423 ] # 2-tuple, references to "orig"
+
+
+tables: # All tables...
+  - dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/tables/0"
+    hash: 98574
+    label: "table"
+    parent: null
+    children: [ ]
+    caption:
+      $ref: "/texts/3"
+    references:
+      - $ref: "/text/??"
+    footnotes:
+      - $ref: "/text/??"
+    image:
+      format: png
+      dpi: 72
+      size:
+        width: 231
+        height: 351
+      uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/tables/0.png"
+      #alternatives: base64 encoded string
+    data: # TableData Type
+      grid: [ [ ] ] # list-of-list of TableCell type
+      otsl: "..." # OTSL token string
+      html: "" # ??
+    prov:
+      - page_no: 1
+        bbox:
+          l: 323.0
+          t: 354.3
+          b: 334.4
+          r: 376.0
+          coord_origin: "BOTTOMLEFT"
+        charspan: [ 1,423 ] # 2-tuple, references to "orig"
+
+figures: # All figures...
+  - dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/figures/0"
+    hash: 7782482
+    label: "figure"
+    parent: null
+    caption:
+      $ref: "/texts/2"
+    references:
+      - $ref: "/text/??"
+    footnotes:
+      - $ref: "/text/??"
+
+    data: # FigureData Type
+      classification: "illustration"
+      confidence: 0.78
+      description: "...."
+      # content structure?
+    image:
+      format: png
+      dpi: 72
+      size:
+        width: 231
+        height: 351
+      uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/figures/0.png"
+      #alternatives: base64 encoded string
+    children:
+      - $ref: "/texts/2"
+    prov:
+      - page_no: 1
+        bbox:
+          l: 456.3
+          t: 145.8
+          b: 623.4
+          r: 702.5
+        charspan: [ 0,288 ]
+
+key_value_items: [ ] # All KV-items
+
+# We should consider this for pages
+pages: # Optional, for layout documents
+  1:
+    hash: "5b0916ed3ead46e69efcddb2c932afd91d0e25ce6828c39e5617e6ee2bd0cf6e"
+    size:
+      width: 768.23
+      height: 583.15
+    image:
+      format: png
+      dpi: 144
+      size:
+        width: 1536
+        height: 1166
+      uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/pages/1.png"
+      #alternatives: base64 encoded string
+    num_elements: 23
\ No newline at end of file
diff --git a/test/test_newdoc.py b/test/test_newdoc.py
new file mode 100644
index 00000000..591abc6c
--- /dev/null
+++ b/test/test_newdoc.py
@@ -0,0 +1,47 @@
+"""Load / resolve / round-trip checks for the draft docling document model."""
+
+from pathlib import Path
+
+import yaml
+
+from docling_core.types.newdoc.document import DoclingDocument
+
+# Resolve the fixture relative to this file, not the working directory.
+DUMMY_DOC_PATH = Path(__file__).parent / "data" / "newdoc" / "dummy_doc.yaml"
+
+
+def test_load_resolve_and_roundtrip():
+    """Validate the dummy document, resolve references and round-trip it."""
+    # Read YAML file
+    with open(DUMMY_DOC_PATH, "r", encoding="utf-8") as fp:
+        dict_from_yaml = yaml.safe_load(fp)
+
+    doc = DoclingDocument.model_validate(dict_from_yaml)
+
+    # Objects can be accessed
+    text_item = doc.texts[0]
+
+    # access members
+    assert text_item.text == "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
+    assert text_item.prov[0].page_no == 1
+
+    # Objects that are references need explicit resolution for now:
+    obj = doc.body[2].resolve(doc=doc)  # Text item with parent
+    parent = obj.parent.resolve(doc=doc)  # it is a figure
+
+    obj2 = parent.children[0].resolve(doc=doc)  # Child of figure must be the same as obj
+
+    assert obj == obj2
+    assert obj is obj2
+
+    doc_dumped = doc.model_dump(mode="json", by_alias=True)
+    out_yaml = yaml.safe_dump(doc_dumped)
+
+    doc_reload = DoclingDocument.model_validate(yaml.safe_load(out_yaml))
+
+    assert doc_reload == doc  # must be equal
+    assert doc_reload is not doc  # can't be identical
+
+
+if __name__ == "__main__":
+    test_load_resolve_and_roundtrip()