Skip to content

Commit

Permalink
Draft new docling document format, pydantic model and tests
Browse files Browse the repository at this point in the history
Signed-off-by: Christoph Auer <[email protected]>
  • Loading branch information
cau-git committed Sep 17, 2024
1 parent 1ed846c commit 334e1b7
Show file tree
Hide file tree
Showing 5 changed files with 403 additions and 0 deletions.
Empty file.
119 changes: 119 additions & 0 deletions docling_core/types/newdoc/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import copy
import enum
from typing import Tuple

from pydantic import BaseModel


## All copied from docling
class CoordOrigin(enum.StrEnum):
TOPLEFT = "TOPLEFT"
BOTTOMLEFT = "BOTTOMLEFT"

class Size(BaseModel):
    """Width and height of a rectangular area; both default to 0.0."""

    width: float = 0.0
    height: float = 0.0


class BoundingBox(BaseModel):
    """Axis-aligned rectangle described by its left/top/right/bottom edges.

    ``coord_origin`` states which page corner the vertical coordinates are
    measured from.  As normalised by :meth:`from_tuple`, a ``TOPLEFT`` box
    has ``t <= b`` and a ``BOTTOMLEFT`` box has ``t >= b``.
    """

    l: float  # left
    t: float  # top
    r: float  # right
    b: float  # bottom

    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT

    @property
    def width(self) -> float:
        """Horizontal extent, ``r - l``."""
        return self.r - self.l

    @property
    def height(self) -> float:
        """Vertical extent as an absolute value, independent of the origin."""
        return abs(self.t - self.b)

    def scaled(self, scale: float) -> "BoundingBox":
        """Return a copy whose four edges are multiplied by ``scale``."""
        out_bbox = copy.deepcopy(self)
        out_bbox.l *= scale
        out_bbox.r *= scale
        out_bbox.t *= scale
        out_bbox.b *= scale

        return out_bbox

    def normalized(self, page_size: Size) -> "BoundingBox":
        """Return a copy with edges divided by the page width/height.

        Raises:
            ZeroDivisionError: if ``page_size.width`` or ``page_size.height``
                is zero (which is the default of ``Size``).
        """
        out_bbox = copy.deepcopy(self)
        out_bbox.l /= page_size.width
        out_bbox.r /= page_size.width
        out_bbox.t /= page_size.height
        out_bbox.b /= page_size.height

        return out_bbox

    def as_tuple(self) -> Tuple[float, float, float, float]:
        """Return the edges as a 4-tuple ordered by ascending coordinate.

        ``TOPLEFT`` boxes yield ``(l, t, r, b)``; ``BOTTOMLEFT`` boxes yield
        ``(l, b, r, t)``.

        Raises:
            ValueError: if ``coord_origin`` is not a known origin.
        """
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return (self.l, self.t, self.r, self.b)
        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return (self.l, self.b, self.r, self.t)
        # Previously fell through and returned None implicitly.
        raise ValueError(f"Unsupported coord_origin: {self.coord_origin!r}")

    @classmethod
    def from_tuple(
        cls, coord: Tuple[float, ...], origin: CoordOrigin
    ) -> "BoundingBox":
        """Build a box from the first four values of ``coord``.

        ``coord`` is read in the same order :meth:`as_tuple` produces for the
        given ``origin``.  Swapped left/right or top/bottom values are put
        back in order so the result is well-formed for that origin.

        Raises:
            ValueError: if ``origin`` is not a known origin.
            IndexError: if ``coord`` has fewer than four entries.
        """
        left, right = coord[0], coord[2]
        if right < left:
            left, right = right, left

        if origin == CoordOrigin.TOPLEFT:
            top, bottom = coord[1], coord[3]
            # Top-left origin: the top edge carries the smaller value.
            if bottom < top:
                top, bottom = bottom, top
        elif origin == CoordOrigin.BOTTOMLEFT:
            bottom, top = coord[1], coord[3]
            # Bottom-left origin: the top edge carries the larger value.
            if bottom > top:
                top, bottom = bottom, top
        else:
            # Previously fell through and returned None implicitly.
            raise ValueError(f"Unsupported coord_origin: {origin!r}")

        # Use cls so subclasses get an instance of their own type.
        return cls(l=left, t=top, r=right, b=bottom, coord_origin=origin)

    def area(self) -> float:
        """Return ``width * height``.

        ``height`` is an absolute value, so the result does not turn
        negative for ``BOTTOMLEFT`` boxes (where ``t > b``).
        """
        return self.width * self.height

    def intersection_area_with(self, other: "BoundingBox") -> float:
        """Return the overlap area of this box and ``other`` (0.0 if none).

        Both boxes are expected to use the same ``coord_origin``; this is not
        checked.  The vertical overlap is computed from each box's min/max
        vertical coordinate, so it works for either origin.
        """
        # Horizontal overlap
        left = max(self.l, other.l)
        right = min(self.r, other.r)

        # Vertical overlap, independent of whether t < b or t > b
        lower = max(min(self.t, self.b), min(other.t, other.b))
        upper = min(max(self.t, self.b), max(other.t, other.b))

        width = right - left
        height = upper - lower

        # If the bounding boxes do not overlap, width or height will be negative
        if width <= 0 or height <= 0:
            return 0.0

        return width * height

    def to_bottom_left_origin(self, page_height: float) -> "BoundingBox":
        """Return this box expressed relative to the bottom-left corner.

        Returns ``self`` unchanged if it already uses ``BOTTOMLEFT``;
        otherwise a new box with ``t`` and ``b`` mirrored at ``page_height``.

        Raises:
            ValueError: if ``coord_origin`` is not a known origin.
        """
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return self
        elif self.coord_origin == CoordOrigin.TOPLEFT:
            return BoundingBox(
                l=self.l,
                r=self.r,
                t=page_height - self.t,
                b=page_height - self.b,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            )
        raise ValueError(f"Unsupported coord_origin: {self.coord_origin!r}")

    def to_top_left_origin(self, page_height: float) -> "BoundingBox":
        """Return this box expressed relative to the top-left corner.

        Returns ``self`` unchanged if it already uses ``TOPLEFT``; otherwise
        a new box with ``t`` and ``b`` mirrored at ``page_height``.

        Raises:
            ValueError: if ``coord_origin`` is not a known origin.
        """
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return self
        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return BoundingBox(
                l=self.l,
                r=self.r,
                t=page_height - self.t,  # self.b
                b=page_height - self.b,  # self.t
                coord_origin=CoordOrigin.TOPLEFT,
            )
        raise ValueError(f"Unsupported coord_origin: {self.coord_origin!r}")
82 changes: 82 additions & 0 deletions docling_core/types/newdoc/document.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from typing import Any, List, Dict, Optional, Tuple, Union

from pydantic import BaseModel, Field, AnyUrl

from docling_core.types.newdoc.base import Size, BoundingBox

class FigureData(BaseModel):  # TBD
    """Placeholder for figure payload data; declares no fields yet."""

class TableData(BaseModel):  # TBD
    """Placeholder for table payload data; declares no fields yet."""

class RefItem(BaseModel):
    """Reference to another item, (de)serialised under the key ``$ref``.

    ``resolve`` expects the reference to split on ``/`` into exactly three
    parts, e.g. ``/texts/0``: an ignored prefix, the name of a collection
    attribute on the document, and an integer index into that collection.
    """

    cref: str = Field(alias="$ref")

    def resolve(self, doc: "DoclingDocument"):
        """Return the item of ``doc`` that this reference points at.

        Raises:
            ValueError: if ``cref`` does not consist of exactly three
                ``/``-separated parts, or the last part is not an integer.
            AttributeError: if ``doc`` has no attribute named after the
                collection part.

        Lookup errors raised by the collection itself propagate unchanged.
        """
        _, path, raw_index = self.cref.split("/")
        # Convert the index first so a malformed index is reported before
        # the collection lookup is attempted.
        position = int(raw_index)
        return getattr(doc, path)[position]

class ImageRef(BaseModel):
    """Description of an image: its format, resolution, size and location."""

    format: str  # png, etc.
    dpi: int  # ...
    size: Size
    uri: AnyUrl


class ProvenanceItem(BaseModel):
    """Where an item comes from: page number, bounding box and char span."""

    page_no: int
    bbox: BoundingBox
    charspan: Tuple[int, int]


class DocItem(BaseModel):
    """Fields shared by all document items; every field is required."""

    dloc: str  # format spec ({document_hash}{json-path})
    hash: int
    label: str
    parent: Optional[RefItem]  # may be None, but must be given explicitly
    children: List[RefItem]
    prov: List[ProvenanceItem]

class TextItem(DocItem):
    """Document item that carries text in raw and sanitized form."""

    orig: str  # untreated representation
    text: str  # sanitized representation

class FloatingItem(DocItem):
    """Document item that may carry a caption, references, footnotes and an image.

    The caption/reference/footnote/image fields default to "absent" so that
    items without them still validate.
    """

    caption: Optional[Union[RefItem, TextItem]] = None
    references: List[Union[RefItem, TextItem]] = []
    footnotes: List[Union[RefItem, TextItem]] = []
    data: Any
    image: Optional[ImageRef] = None


# Figures and tables derive from FloatingItem (not plain DocItem) so that
# caption/references/footnotes/image are modelled instead of being dropped
# as unknown keys during validation.
class FigureItem(FloatingItem):
    """Floating item whose ``data`` is a ``FigureData``."""

    data: FigureData


class TableItem(FloatingItem):
    """Floating item whose ``data`` is a ``TableData``."""

    data: TableData

class KeyValueItem(DocItem):
    """Key-value document item; adds no fields beyond ``DocItem``."""

# Union of the concrete item types defined in this module.
ContentItem = Union[
    TextItem,
    FigureItem,
    TableItem,
    KeyValueItem,
]

class DocumentContent(BaseModel):
    """Reference lists and per-type item collections; all default to empty."""

    # By-reference lists
    furniture: List[RefItem] = []
    body: List[RefItem] = []

    # Collections holding the actual items
    texts: List[TextItem] = []
    figures: List[FigureItem] = []
    tables: List[TableItem] = []
    key_value_items: List[KeyValueItem] = []

class PageItem(DocumentContent):
    """Content container for a single page, plus page-level metadata."""

    hash: str  # page hash
    size: Size
    image: Optional[ImageRef]  # may be None, but must be given explicitly
    num_elements: int

class DoclingDocument(DocumentContent):
    """Top-level document: content collections, metadata and pages."""

    description: Any
    file_info: Any
    pages: Dict[int, PageItem] = {}  # empty as default
169 changes: 169 additions & 0 deletions test/data/newdoc/dummy_doc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
---
## Document with content + layout info
description: { } # DescriptionType - TBD
file_info: # FileInfoType - TBD
document_hash: e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5
furniture: # Headers, footers, framing, navigation elements, all other non-body text
- $ref: "/texts/0"

body: # All elements in other arrays, by-reference only
- $ref: "/texts/1"
- $ref: "/figures/0"
- $ref: "/texts/2"
- $ref: "/texts/3"
- $ref: "/tables/0"

texts: # All elements that have a text-string representation, with actual data
- orig: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
text: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/0"
hash: 132103230
label: "page_header"
parent: null
children: [ ]
prov:
- page_no: 1
bbox:
l: 21.3
t: 52.3
b: 476.2
r: 35.2
charspan: [ 1,423 ] # 2-tuple, references to "orig"
- orig: "DocLayNet: A Large Human-Annotated Dataset for\nDocument-Layout Analysis"
text: "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"
dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/1"
hash: 2349732 # uint64 hash of dloc
label: "title"
parent: null
children: [ ]
prov: # must exist, can be empty
- page_no: 1
bbox:
l: 65.0
t: 30.1
b: 53.4
r: 623.2
charspan: [ 1,423 ] # 2-tuple, references to "orig"
- orig: "OPERATION (cont.)" # nested inside the figure
text: "OPERATION (cont.)"
dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/2"
hash: 6978483
label: "section_header"
parent:
$ref: "/figures/0"
children: [ ]
prov:
- page_no: 1
bbox:
l: 323.0
t: 354.3
b: 334.4
r: 376.0
charspan: [ 0,734 ]
- orig: "Figure 1: Four examples of complex page layouts across dif-\nferent document categories" # nested inside the figure
text: "Figure 1: Four examples of complex page layouts across different document categories"
dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/3"
hash: 6978483
label: "caption"
parent:
$ref: "/figures/0"
children: [ ]
prov:
- page_no: 1
bbox:
l: 323.0
t: 354.3
b: 334.4
r: 376.0
coord_origin: "BOTTOMLEFT"
charspan: [ 1,423 ] # 2-tuple, references to "orig"


tables: # All tables...
- dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/tables/0"
hash: 98574
label: "table"
parent: null
children: [ ]
caption:
$ref: "/texts/3"
references:
- $ref: "/text/??"
footnotes:
- $ref: "/text/??"
image:
format: png
dpi: 72
size:
width: 231
height: 351
uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/tables/0.png"
#alternatives: base64 encoded string
data: # TableData Type
grid: [ [ ] ] # list-of-list of TableCell type
otsl: "<fcel><ecel>..." # OTSL token string
html: "" # ??
prov:
- page_no: 1
bbox:
l: 323.0
t: 354.3
b: 334.4
r: 376.0
coord_origin: "BOTTOMLEFT"
charspan: [ 1,423 ] # 2-tuple, references to "orig"

figures: # All figures...
- dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/figures/0"
hash: 7782482
label: "figure"
parent: null
caption:
$ref: "/texts/2"
references:
- $ref: "/text/??"
footnotes:
- $ref: "/text/??"

data: # FigureData Type
classification: "illustration"
confidence: 0.78
description: "...."
# content structure?
image:
format: png
dpi: 72
size:
width: 231
height: 351
uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/figures/0.png"
#alternatives: base64 encoded string
children:
- $ref: "/texts/2"
prov:
- page_no: 1
bbox:
l: 456.3
t: 145.8
b: 623.4
r: 702.5
charspan: [ 0,288 ]

key_value_items: [ ] # All KV-items

# We should consider this for pages
pages: # Optional, for layout documents
1:
hash: "5b0916ed3ead46e69efcddb2c932afd91d0e25ce6828c39e5617e6ee2bd0cf6e"
size:
width: 768.23
height: 583.15
image:
format: png
dpi: 144
size:
width: 1536
height: 1166
uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/pages/1.png"
#alternatives: base64 encoded string
num_elements: 23
Loading

0 comments on commit 334e1b7

Please sign in to comment.