Skip to content

Commit

Permalink
Draft new docling document format, pydantic model and tests
Browse files Browse the repository at this point in the history
Signed-off-by: Christoph Auer <[email protected]>
  • Loading branch information
cau-git committed Sep 17, 2024
1 parent 1ed846c commit 6dd4085
Show file tree
Hide file tree
Showing 5 changed files with 401 additions and 0 deletions.
Empty file.
125 changes: 125 additions & 0 deletions docling_core/types/newdoc/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import copy
import enum
import warnings
from enum import Enum, auto
from io import BytesIO
from typing import Annotated, Any, Dict, List, Optional, Tuple, Union

from PIL.Image import Image
from pydantic import BaseModel, ConfigDict, Field, model_validator
from typing_extensions import Self

from docling.backend.abstract_backend import PdfPageBackend

## All copied from docling
class CoordOrigin(str, Enum):
    """Corner of the page that bounding-box coordinates are measured from.

    Implemented as a ``str``/``Enum`` mix-in rather than ``enum.StrEnum`` so the
    module also imports on Python versions older than 3.11, where ``StrEnum``
    does not exist. Members still compare equal to their plain string values.
    """

    TOPLEFT = "TOPLEFT"
    BOTTOMLEFT = "BOTTOMLEFT"

    def __str__(self) -> str:
        # Keep the StrEnum behaviour: str(member) is the bare value, not
        # "CoordOrigin.TOPLEFT".
        return str(self.value)

class Size(BaseModel):
    """Two-dimensional extent made of a width and a height."""

    width: float = 0.0  # defaults to 0.0 when omitted
    height: float = 0.0  # defaults to 0.0 when omitted


class BoundingBox(BaseModel):
    """Axis-aligned rectangle described by its left/top/right/bottom edges.

    ``coord_origin`` says where the y-axis starts. With ``TOPLEFT`` y grows
    downwards, so a well-formed box has ``t <= b``; with ``BOTTOMLEFT`` y grows
    upwards, so a well-formed box has ``t >= b``. ``from_tuple`` produces boxes
    that satisfy these orderings.
    """

    l: float  # left
    t: float  # top
    r: float  # right
    b: float  # bottom

    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT

    @property
    def width(self) -> float:
        """Horizontal extent ``r - l`` (negative if the edges are swapped)."""
        return self.r - self.l

    @property
    def height(self) -> float:
        """Vertical extent as a non-negative number, regardless of origin."""
        return abs(self.t - self.b)

    def scaled(self, scale: float) -> "BoundingBox":
        """Return a copy with all four edges multiplied by *scale*."""
        out_bbox = copy.deepcopy(self)
        out_bbox.l *= scale
        out_bbox.r *= scale
        out_bbox.t *= scale
        out_bbox.b *= scale

        return out_bbox

    def normalized(self, page_size: Size) -> "BoundingBox":
        """Return a copy with edges divided by the page dimensions.

        ``l``/``r`` are divided by ``page_size.width`` and ``t``/``b`` by
        ``page_size.height``.

        Raises:
            ZeroDivisionError: if the relevant page dimension is zero (which is
                the default value of ``Size``).
        """
        out_bbox = copy.deepcopy(self)
        out_bbox.l /= page_size.width
        out_bbox.r /= page_size.width
        out_bbox.t /= page_size.height
        out_bbox.b /= page_size.height

        return out_bbox

    def as_tuple(self) -> Tuple[float, float, float, float]:
        """Return the edges as a 4-tuple.

        The order is ``(l, t, r, b)`` for ``TOPLEFT`` and ``(l, b, r, t)`` for
        ``BOTTOMLEFT``, i.e. the smaller y-value comes first for a well-formed
        box in either system.
        """
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return (self.l, self.t, self.r, self.b)
        return (self.l, self.b, self.r, self.t)

    @classmethod
    def from_tuple(
        cls, coord: Tuple[float, ...], origin: CoordOrigin
    ) -> "BoundingBox":
        """Build a box from the tuple layout produced by ``as_tuple``.

        Only the first four entries of *coord* are read. Swapped edges are
        reordered so that ``l <= r`` and the top/bottom ordering matches
        *origin*.

        Raises:
            IndexError: if *coord* has fewer than four entries.
        """
        if origin == CoordOrigin.TOPLEFT:
            l, t, r, b = coord[0], coord[1], coord[2], coord[3]
            if r < l:
                l, r = r, l
            if b < t:
                b, t = t, b
        else:
            l, b, r, t = coord[0], coord[1], coord[2], coord[3]
            if r < l:
                l, r = r, l
            if b > t:
                b, t = t, b

        # Use cls so subclasses get an instance of their own type.
        return cls(l=l, t=t, r=r, b=b, coord_origin=origin)

    def area(self) -> float:
        """Return the area of the box.

        For ``BOTTOMLEFT`` boxes ``b - t`` is negative for a well-formed box,
        so the sign is flipped to keep the area positive in both systems.
        """
        area = (self.r - self.l) * (self.b - self.t)
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            area = -area
        return area

    def intersection_area_with(self, other: "BoundingBox") -> float:
        """Return the overlap area of this box and *other* (0.0 if disjoint).

        Raises:
            ValueError: if the two boxes use different coordinate origins;
                they cannot be compared without knowing the page height.
        """
        if self.coord_origin != other.coord_origin:
            raise ValueError("BoundingBoxes have different CoordOrigin")

        # Horizontal overlap is independent of the origin.
        width = min(self.r, other.r) - max(self.l, other.l)

        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            # y grows upwards: from the higher bottom edge to the lower top edge.
            height = min(self.t, other.t) - max(self.b, other.b)
        else:
            # y grows downwards: from the lower top edge to the higher bottom edge.
            height = min(self.b, other.b) - max(self.t, other.t)

        # If the bounding boxes do not overlap, width or height will be negative
        if width <= 0 or height <= 0:
            return 0.0

        return width * height

    def to_bottom_left_origin(self, page_height: float) -> "BoundingBox":
        """Return the box expressed with a ``BOTTOMLEFT`` origin.

        Returns ``self`` (not a copy) when the box already uses that origin;
        otherwise a new box with ``t`` and ``b`` mirrored at *page_height*.
        """
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return self
        return BoundingBox(
            l=self.l,
            r=self.r,
            t=page_height - self.t,
            b=page_height - self.b,
            coord_origin=CoordOrigin.BOTTOMLEFT,
        )

    def to_top_left_origin(self, page_height: float) -> "BoundingBox":
        """Return the box expressed with a ``TOPLEFT`` origin.

        Returns ``self`` (not a copy) when the box already uses that origin;
        otherwise a new box with ``t`` and ``b`` mirrored at *page_height*.
        """
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return self
        return BoundingBox(
            l=self.l,
            r=self.r,
            t=page_height - self.t,  # self.b
            b=page_height - self.b,  # self.t
            coord_origin=CoordOrigin.TOPLEFT,
        )
82 changes: 82 additions & 0 deletions docling_core/types/newdoc/document.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from typing import Any, List, Dict, Optional, Tuple, Union

from pydantic import BaseModel, Field, AnyUrl

from docling_core.types.newdoc.base import Size, BoundingBox

class FigureData(BaseModel):  # TBD
    """Placeholder payload model for figures; no fields are declared yet."""

    pass

class TableData(BaseModel):  # TBD
    """Placeholder payload model for tables; no fields are declared yet."""

    pass

class RefItem(BaseModel):
    """Reference to an item held in one of a document's item lists.

    The reference string is populated from the ``$ref`` key and has the form
    ``<prefix>/<list attribute>/<index>``, e.g. ``"/texts/0"``.
    """

    cref: str = Field(alias="$ref")

    def resolve(self, doc: "DoclingDocument"):
        """Return the item of *doc* that this reference points to.

        The part before the first ``/`` is ignored; the second part names an
        attribute of *doc* and the third is the integer index into it.

        Raises:
            ValueError: if the reference does not split into exactly three
                ``/``-separated parts, or the index is not an integer.
            AttributeError: if *doc* has no attribute with the given name.
            IndexError: if the index is out of range (``KeyError`` if the
                attribute is a mapping).
        """
        _, path, index_str = self.cref.split("/")
        return getattr(doc, path)[int(index_str)]

class ImageRef(BaseModel):
    """Reference to an image stored at a URI, together with basic metadata."""

    format: str  # png, etc.
    dpi: int  # image resolution (dots per inch)
    size: Size  # presumably the image dimensions in pixels -- TODO confirm
    uri: AnyUrl  # location of the image data


class ProvenanceItem(BaseModel):
    """Where an item comes from: a page, a bounding box and a character span."""

    page_no: int  # presumably a key of DoclingDocument.pages -- TODO confirm
    bbox: BoundingBox
    charspan: Tuple[int, int]  # 2-tuple; per the sample document it refers to "orig"


class DocItem(BaseModel):
    """Fields shared by all document items.

    None of the fields declares a default, so all of them must be supplied,
    including ``parent`` (which may be ``None``).
    """

    dloc: str  # format spec ({document_hash}{json-path})
    hash: int  # per the sample document: uint64 hash of dloc
    label: str
    parent: Optional[RefItem]
    children: List[RefItem]
    prov: List[ProvenanceItem]  # may be an empty list

class TextItem(DocItem):
    """Document item that carries text in original and sanitized form."""

    orig: str  # untreated representation
    text: str  # sanitized representation

class FloatingItem(DocItem):
    """Document item with a caption, references, footnotes, data and an image.

    Caption, references and footnotes may each be given by reference
    (``RefItem``) or inline (``TextItem``).
    """

    # NOTE(review): FigureItem and TableItem in this file derive from DocItem,
    # not from this class -- confirm whether that is intended.
    caption: Optional[Union[RefItem, TextItem]]
    references: List[Union[RefItem, TextItem]]
    footnotes: List[Union[RefItem, TextItem]]
    data: Any
    image: Optional[ImageRef]

class FigureItem(DocItem):
    """Document item for a figure, carrying a ``FigureData`` payload."""

    data: FigureData

class TableItem(DocItem):
    """Document item for a table, carrying a ``TableData`` payload."""

    data: TableData

class KeyValueItem(DocItem):
    """Document item for key-value content; adds no fields to ``DocItem`` yet."""

    pass

# Any concrete item model that may appear inline in the furniture/body lists.
ContentItem = Union[TextItem, FigureItem, TableItem, KeyValueItem]

class DocumentContent(BaseModel):
    """Item lists shared by whole documents and single pages.

    ``furniture`` and ``body`` accept references or inline items; the other
    lists hold the items themselves. Every list defaults to empty.
    """

    # Per the sample document: headers, footers, framing, navigation elements
    # and all other non-body text.
    furniture: List[Union[RefItem, ContentItem]] = []
    # Per the sample document: entries point by reference into the lists below.
    body: List[Union[RefItem, ContentItem]] = []
    texts: List[TextItem] = []
    figures: List[FigureItem] = []
    tables: List[TableItem] = []
    key_value_items: List[KeyValueItem] = []

class PageItem(DocumentContent):
    """A single page: the shared content lists plus page-level metadata."""

    hash: str  # page hash
    size: Size
    image: Optional[ImageRef]  # must be supplied, but may be None
    num_elements: int

class DoclingDocument(DocumentContent):
    """Top-level document: the shared content lists plus document metadata."""

    description: Any  # type still to be defined
    file_info: Any  # type still to be defined
    pages: Dict[int, PageItem] = {}  # empty as default
169 changes: 169 additions & 0 deletions test/data/newdoc/dummy_doc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
---
## Document with content + layout info
description: { } # DescriptionType - TBD
file_info: # FileInfoType - TBD
document_hash: e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5
furniture: # Headers, footers, framing, navigation elements, all other non-body text
- $ref: "/texts/0"

body: # All elements in other arrays, by-reference only
- $ref: "/texts/1"
- $ref: "/figures/0"
- $ref: "/texts/2"
- $ref: "/texts/3"
- $ref: "/tables/0"

texts: # All elements that have a text-string representation, with actual data
- orig: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
text: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/0"
hash: 132103230
label: "page_header"
parent: null
children: [ ]
prov:
- page_no: 1
bbox:
l: 21.3
t: 52.3
b: 476.2
r: 35.2
charspan: [ 1,423 ] # 2-tuple, references to "orig"
- orig: "DocLayNet: A Large Human-Annotated Dataset for\nDocument-Layout Analysis"
text: "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"
dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/1"
hash: 2349732 # uint64 hash of dloc
label: "title"
parent: null
children: [ ]
prov: # must exist, can be empty
- page_no: 1
bbox:
l: 65.0
t: 30.1
b: 53.4
r: 623.2
charspan: [ 1,423 ] # 2-tuple, references to "orig"
- orig: "OPERATION (cont.)" # nested inside the figure
text: "OPERATION (cont.)"
dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/2"
hash: 6978483
label: "section_header"
parent:
$ref: "/figures/0"
children: [ ]
prov:
- page_no: 1
bbox:
l: 323.0
t: 354.3
b: 334.4
r: 376.0
charspan: [ 0,734 ]
- orig: "Figure 1: Four examples of complex page layouts across dif-\nferent document categories" # nested inside the figure
text: "Figure 1: Four examples of complex page layouts across different document categories"
dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/3"
hash: 6978483
label: "caption"
parent:
$ref: "/figures/0"
children: [ ]
prov:
- page_no: 1
bbox:
l: 323.0
t: 354.3
b: 334.4
r: 376.0
coord_origin: "BOTTOMLEFT"
charspan: [ 1,423 ] # 2-tuple, references to "orig"


tables: # All tables...
- dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/tables/0"
hash: 98574
label: "table"
parent: null
children: [ ]
caption:
$ref: "/texts/3"
references:
- $ref: "/text/??"
footnotes:
- $ref: "/text/??"
image:
format: png
dpi: 72
size:
width: 231
height: 351
uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/tables/0.png"
#alternatives: base64 encoded string
data: # TableData Type
grid: [ [ ] ] # list-of-list of TableCell type
otsl: "<fcel><ecel>..." # OTSL token string
html: "" # ??
prov:
- page_no: 1
bbox:
l: 323.0
t: 354.3
b: 334.4
r: 376.0
coord_origin: "BOTTOMLEFT"
charspan: [ 1,423 ] # 2-tuple, references to "orig"

figures: # All figures...
- dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/figures/0"
hash: 7782482
label: "figure"
parent: null
caption:
$ref: "/texts/2"
references:
- $ref: "/text/??"
footnotes:
- $ref: "/text/??"

data: # FigureData Type
classification: "illustration"
confidence: 0.78
description: "...."
# content structure?
image:
format: png
dpi: 72
size:
width: 231
height: 351
uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/figures/0.png"
#alternatives: base64 encoded string
children:
- $ref: "/texts/2"
prov:
- page_no: 1
bbox:
l: 456.3
t: 145.8
b: 623.4
r: 702.5
charspan: [ 0,288 ]

key_value_items: [ ] # All KV-items

# We should consider this for pages
pages: # Optional, for layout documents
1:
hash: "5b0916ed3ead46e69efcddb2c932afd91d0e25ce6828c39e5617e6ee2bd0cf6e"
size:
width: 768.23
height: 583.15
image:
format: png
dpi: 144
size:
width: 1536
height: 1166
uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/pages/1.png"
#alternatives: base64 encoded string
num_elements: 23
Loading

0 comments on commit 6dd4085

Please sign in to comment.