Skip to content

Commit 334e1b7

Browse files
committed
Draft new docling document format, pydantic model and tests
Signed-off-by: Christoph Auer <[email protected]>
1 parent 1ed846c commit 334e1b7

File tree

5 files changed

+403
-0
lines changed

5 files changed

+403
-0
lines changed

docling_core/types/newdoc/__init__.py

Whitespace-only changes.

docling_core/types/newdoc/base.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
import copy
2+
import enum
3+
from typing import Tuple
4+
5+
from pydantic import BaseModel
6+
7+
8+
## All copied from docling
9+
class CoordOrigin(enum.StrEnum):
10+
TOPLEFT = "TOPLEFT"
11+
BOTTOMLEFT = "BOTTOMLEFT"
12+
13+
class Size(BaseModel):
    """Width/height pair; both dimensions default to ``0.0``."""

    width: float = 0.0
    height: float = 0.0
16+
17+
18+
class BoundingBox(BaseModel):
    """Axis-aligned rectangle given by its left/top/right/bottom edges.

    ``coord_origin`` tells how the vertical edges are to be read. Boxes built
    through :meth:`from_tuple` satisfy ``t <= b`` for ``TOPLEFT`` and
    ``t >= b`` for ``BOTTOMLEFT``; the plain constructor does not enforce
    any ordering of the edges.
    """

    l: float  # left
    t: float  # top
    r: float  # right
    b: float  # bottom

    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT

    @property
    def width(self):
        """Horizontal extent, ``r - l`` (negative if the edges are swapped)."""
        return self.r - self.l

    @property
    def height(self):
        """Absolute vertical extent, independent of the coordinate origin."""
        return abs(self.t - self.b)

    def scaled(self, scale: float) -> "BoundingBox":
        """Return a deep copy with all four edges multiplied by ``scale``."""
        out_bbox = copy.deepcopy(self)
        out_bbox.l *= scale
        out_bbox.r *= scale
        out_bbox.t *= scale
        out_bbox.b *= scale

        return out_bbox

    def normalized(self, page_size: Size) -> "BoundingBox":
        """Return a deep copy with edges divided by the page dimensions.

        ``l``/``r`` are divided by ``page_size.width`` and ``t``/``b`` by
        ``page_size.height``.

        Raises:
            ZeroDivisionError: if a page dimension is zero (which is the
                default value of both ``Size`` fields).
        """
        out_bbox = copy.deepcopy(self)
        out_bbox.l /= page_size.width
        out_bbox.r /= page_size.width
        out_bbox.t /= page_size.height
        out_bbox.b /= page_size.height

        return out_bbox

    def as_tuple(self):
        """Return the edges as a 4-tuple whose order depends on the origin.

        ``(l, t, r, b)`` for ``TOPLEFT`` and ``(l, b, r, t)`` for
        ``BOTTOMLEFT``; this mirrors the order accepted by
        :meth:`from_tuple`.
        """
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return (self.l, self.t, self.r, self.b)
        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return (self.l, self.b, self.r, self.t)

    @classmethod
    def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
        """Build a box from the first four entries of ``coord``.

        The entries are read as ``(l, t, r, b)`` for ``TOPLEFT`` and as
        ``(l, b, r, t)`` for ``BOTTOMLEFT``. Swapped edges are put back in
        order so that ``l <= r`` and the top/bottom pair matches the origin.
        """
        if origin == CoordOrigin.TOPLEFT:
            l, t, r, b = coord[0], coord[1], coord[2], coord[3]
            if r < l:
                l, r = r, l
            if b < t:
                b, t = t, b

            # ``cls`` rather than a hard-coded class keeps subclasses intact.
            return cls(l=l, t=t, r=r, b=b, coord_origin=origin)
        elif origin == CoordOrigin.BOTTOMLEFT:
            l, b, r, t = coord[0], coord[1], coord[2], coord[3]
            if r < l:
                l, r = r, l
            if b > t:
                b, t = t, b

            return cls(l=l, t=t, r=r, b=b, coord_origin=origin)

    def area(self) -> float:
        """Return width times height, with the height read per the origin.

        With a ``BOTTOMLEFT`` origin the top edge has the larger coordinate,
        so the height is ``t - b`` there and ``b - t`` otherwise. Without
        this distinction every well-formed ``BOTTOMLEFT`` box would report a
        negative area.
        """
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            signed_height = self.t - self.b
        else:
            signed_height = self.b - self.t
        return (self.r - self.l) * signed_height

    def intersection_area_with(self, other: "BoundingBox") -> float:
        """Return the area of the overlap with ``other`` (``0.0`` if none).

        Raises:
            ValueError: if the two boxes use different coordinate origins;
                their vertical edges cannot be compared without knowing the
                page height.
        """
        if self.coord_origin != other.coord_origin:
            raise ValueError(
                "BoundingBoxes have different CoordOrigin, "
                "cannot compute their intersection"
            )

        # Horizontal overlap is the same for both origins.
        width = min(self.r, other.r) - max(self.l, other.l)

        # Vertical overlap: the lower of the two tops against the higher of
        # the two bottoms, where "lower"/"higher" depends on the y direction.
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            height = min(self.t, other.t) - max(self.b, other.b)
        else:
            height = min(self.b, other.b) - max(self.t, other.t)

        # If the bounding boxes do not overlap, width or height will be
        # zero or negative.
        if width <= 0 or height <= 0:
            return 0.0

        return width * height

    def to_bottom_left_origin(self, page_height) -> "BoundingBox":
        """Return this box expressed with a ``BOTTOMLEFT`` origin.

        A box that already uses ``BOTTOMLEFT`` is returned as-is (the same
        object, not a copy); otherwise a new box is built with ``t`` and
        ``b`` mirrored against ``page_height``.
        """
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return self
        elif self.coord_origin == CoordOrigin.TOPLEFT:
            return BoundingBox(
                l=self.l,
                r=self.r,
                t=page_height - self.t,
                b=page_height - self.b,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            )

    def to_top_left_origin(self, page_height) -> "BoundingBox":
        """Return this box expressed with a ``TOPLEFT`` origin.

        A box that already uses ``TOPLEFT`` is returned as-is (the same
        object, not a copy); otherwise a new box is built with ``t`` and
        ``b`` mirrored against ``page_height``.
        """
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return self
        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return BoundingBox(
                l=self.l,
                r=self.r,
                t=page_height - self.t,  # self.b
                b=page_height - self.b,  # self.t
                coord_origin=CoordOrigin.TOPLEFT,
            )

docling_core/types/newdoc/document.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
from typing import Any, List, Dict, Optional, Tuple, Union
2+
3+
from pydantic import BaseModel, Field, AnyUrl
4+
5+
from docling_core.types.newdoc.base import Size, BoundingBox
6+
7+
class FigureData(BaseModel):  # TBD
    """Placeholder for figure payload data; no fields are defined yet."""
9+
10+
class TableData(BaseModel):  # TBD
    """Placeholder for table payload data; no fields are defined yet."""
12+
13+
class RefItem(BaseModel):
    """Reference to another item of a document, stored under ``$ref``."""

    cref: str = Field(alias="$ref")

    def resolve(self, doc: "DoclingDocument"):
        """Return the object in ``doc`` that this reference points to.

        ``cref`` is split on ``/`` into exactly three parts, e.g.
        ``"/texts/0"``: whatever precedes the first slash (ignored), the name
        of an attribute of ``doc``, and an integer used to index into it.

        Raises:
            ValueError: if ``cref`` does not split into exactly three parts
                or its last part is not an integer.
            AttributeError: if ``doc`` has no attribute with the given name.
        """
        _, path, raw_index = self.cref.split("/")
        position = int(raw_index)
        # ``getattr`` is the public spelling of attribute lookup; calling the
        # ``__getattribute__`` dunder directly skips the normal fallback.
        return getattr(doc, path)[position]
21+
22+
class ImageRef(BaseModel):
    """Description of an image: format, resolution, size and location."""

    format: str  # png, etc.
    dpi: int  # ...
    size: Size
    uri: AnyUrl
27+
28+
29+
class ProvenanceItem(BaseModel):
    """Where an item comes from: page number, bounding box and char span."""

    page_no: int
    bbox: BoundingBox
    charspan: Tuple[int, int]  # pair of integers
33+
34+
35+
class DocItem(BaseModel):
    """Fields shared by every item of a document."""

    dloc: str  # format spec ({document_hash}{json-path})
    hash: int
    label: str
    parent: Optional[RefItem]  # may be None
    children: List[RefItem]
    prov: List[ProvenanceItem]
42+
43+
class TextItem(DocItem):
    """Document item that carries text in raw and cleaned-up form."""

    orig: str  # untreated representation
    text: str  # sanitized representation
46+
47+
class FloatingItem(DocItem):
    """Document item with caption, references, footnotes, data and image.

    Caption, references and footnotes are each given either by reference
    (``RefItem``) or inline (``TextItem``).
    """

    caption: Optional[Union[RefItem, TextItem]]
    references: List[Union[RefItem, TextItem]]
    footnotes: List[Union[RefItem, TextItem]]
    data: Any  # untyped payload
    image: Optional[ImageRef]
53+
54+
class FigureItem(DocItem):
    """Document item whose payload is a ``FigureData``."""

    # NOTE(review): derives from DocItem, so caption/references/footnotes/
    # image are not declared fields here -- confirm whether FloatingItem was
    # the intended base class.
    data: FigureData
56+
57+
class TableItem(DocItem):
    """Document item whose payload is a ``TableData``."""

    # NOTE(review): derives from DocItem, so caption/references/footnotes/
    # image are not declared fields here -- confirm whether FloatingItem was
    # the intended base class.
    data: TableData
59+
60+
class KeyValueItem(DocItem):
    """Key-value document item; adds no fields beyond ``DocItem`` so far."""
62+
63+
# Any of the concrete item types a document can contain.
ContentItem = Union[
    TextItem,
    FigureItem,
    TableItem,
    KeyValueItem,
]
64+
65+
class DocumentContent(BaseModel):
    """Container of references plus the item collections they point into.

    Every collection defaults to an empty list.
    """

    # By-reference listings.
    furniture: List[RefItem] = []
    body: List[RefItem] = []

    # Item collections holding the actual data, one per item type.
    texts: List[TextItem] = []
    figures: List[FigureItem] = []
    tables: List[TableItem] = []
    key_value_items: List[KeyValueItem] = []
72+
73+
class PageItem(DocumentContent):
    """Content container for one page, with page-level metadata."""

    hash: str  # page hash
    size: Size
    image: Optional[ImageRef]  # may be None
    num_elements: int
78+
79+
class DoclingDocument(DocumentContent):
    """Top-level document: content collections plus metadata and pages."""

    description: Any  # untyped for now
    file_info: Any  # untyped for now
    pages: Dict[int, PageItem] = {}  # empty as default

test/data/newdoc/dummy_doc.yaml

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
---
2+
## Document with content + layout info
3+
description: { } # DescriptionType - TBD
4+
file_info: # FileInfoType - TBD
5+
document_hash: e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5
6+
furniture: # Headers, footers, framing, navigation elements, all other non-body text
7+
- $ref: "/texts/0"
8+
9+
body: # All elements in other arrays, by-reference only
10+
- $ref: "/texts/1"
11+
- $ref: "/figures/0"
12+
- $ref: "/texts/2"
13+
- $ref: "/texts/3"
14+
- $ref: "/tables/0"
15+
16+
texts: # All elements that have a text-string representation, with actual data
17+
- orig: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
18+
text: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022"
19+
dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/0"
20+
hash: 132103230
21+
label: "page_header"
22+
parent: null
23+
children: [ ]
24+
prov:
25+
- page_no: 1
26+
bbox:
27+
l: 21.3
28+
t: 52.3
29+
b: 476.2
30+
r: 35.2
31+
charspan: [ 1,423 ] # 2-tuple, references to "orig"
32+
- orig: "DocLayNet: A Large Human-Annotated Dataset for\nDocument-Layout Analysis"
33+
text: "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"
34+
dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/1"
35+
hash: 2349732 # uint64 hash of dloc
36+
label: "title"
37+
parent: null
38+
children: [ ]
39+
prov: # must exist, can be empty
40+
- page_no: 1
41+
bbox:
42+
l: 65.0
43+
t: 30.1
44+
b: 53.4
45+
r: 623.2
46+
charspan: [ 1,423 ] # 2-tuple, references to "orig"
47+
- orig: "OPERATION (cont.)" # nested inside the figure
48+
text: "OPERATION (cont.)"
49+
dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/2"
50+
hash: 6978483
51+
label: "section_header"
52+
parent:
53+
$ref: "/figures/0"
54+
children: [ ]
55+
prov:
56+
- page_no: 1
57+
bbox:
58+
l: 323.0
59+
t: 354.3
60+
b: 334.4
61+
r: 376.0
62+
charspan: [ 0,734 ]
63+
- orig: "Figure 1: Four examples of complex page layouts across dif-\nferent document categories" # nested inside the figure
64+
text: "Figure 1: Four examples of complex page layouts across different document categories"
65+
dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/3"
66+
hash: 6978483
67+
label: "caption"
68+
parent:
69+
$ref: "/figures/0"
70+
children: [ ]
71+
prov:
72+
- page_no: 1
73+
bbox:
74+
l: 323.0
75+
t: 354.3
76+
b: 334.4
77+
r: 376.0
78+
coord_origin: "BOTTOMLEFT"
79+
charspan: [ 1,423 ] # 2-tuple, references to "orig"
80+
81+
82+
tables: # All tables...
83+
- dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/tables/0"
84+
hash: 98574
85+
label: "table"
86+
parent: null
87+
children: [ ]
88+
caption:
89+
$ref: "/texts/3"
90+
references:
91+
- $ref: "/text/??"
92+
footnotes:
93+
- $ref: "/text/??"
94+
image:
95+
format: png
96+
dpi: 72
97+
size:
98+
width: 231
99+
height: 351
100+
uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/tables/0.png"
101+
#alternatives: base64 encoded string
102+
data: # TableData Type
103+
grid: [ [ ] ] # list-of-list of TableCell type
104+
otsl: "<fcel><ecel>..." # OTSL token string
105+
html: "" # ??
106+
prov:
107+
- page_no: 1
108+
bbox:
109+
l: 323.0
110+
t: 354.3
111+
b: 334.4
112+
r: 376.0
113+
coord_origin: "BOTTOMLEFT"
114+
charspan: [ 1,423 ] # 2-tuple, references to "orig"
115+
116+
figures: # All figures...
117+
- dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/figures/0"
118+
hash: 7782482
119+
label: "figure"
120+
parent: null
121+
caption:
122+
$ref: "/texts/2"
123+
references:
124+
- $ref: "/text/??"
125+
footnotes:
126+
- $ref: "/text/??"
127+
128+
data: # FigureData Type
129+
classification: "illustration"
130+
confidence: 0.78
131+
description: "...."
132+
# content structure?
133+
image:
134+
format: png
135+
dpi: 72
136+
size:
137+
width: 231
138+
height: 351
139+
uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/figures/0.png"
140+
#alternatives: base64 encoded string
141+
children:
142+
- $ref: "/texts/2"
143+
prov:
144+
- page_no: 1
145+
bbox:
146+
l: 456.3
147+
t: 145.8
148+
b: 623.4
149+
r: 702.5
150+
charspan: [ 0,288 ]
151+
152+
key_value_items: [ ] # All KV-items
153+
154+
# We should consider this for pages
155+
pages: # Optional, for layout documents
156+
1:
157+
hash: "5b0916ed3ead46e69efcddb2c932afd91d0e25ce6828c39e5617e6ee2bd0cf6e"
158+
size:
159+
width: 768.23
160+
height: 583.15
161+
image:
162+
format: png
163+
dpi: 144
164+
size:
165+
width: 1536
166+
height: 1166
167+
uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/pages/1.png"
168+
#alternatives: base64 encoded string
169+
num_elements: 23

0 commit comments

Comments
 (0)