-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Draft new docling document format, pydantic model and tests
Signed-off-by: Christoph Auer <[email protected]>
- Loading branch information
Showing
5 changed files
with
401 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
import copy | ||
import enum | ||
import warnings | ||
from enum import Enum, auto | ||
from io import BytesIO | ||
from typing import Annotated, Any, Dict, List, Optional, Tuple, Union | ||
|
||
from PIL.Image import Image | ||
from pydantic import BaseModel, ConfigDict, Field, model_validator | ||
from typing_extensions import Self | ||
|
||
from docling.backend.abstract_backend import PdfPageBackend | ||
|
||
## All copied from docling | ||
class CoordOrigin(str, Enum):
    """Corner of the page that bounding-box coordinates are measured from.

    Implemented as a ``str``/``Enum`` mix-in rather than ``enum.StrEnum`` so
    the module also imports on interpreters older than Python 3.11, where
    ``StrEnum`` does not exist.
    """

    TOPLEFT = "TOPLEFT"
    BOTTOMLEFT = "BOTTOMLEFT"

    def __str__(self) -> str:
        # Keep StrEnum semantics: str()/format() yield the bare value,
        # not "CoordOrigin.TOPLEFT".
        return self.value
|
||
class Size(BaseModel):
    """Two-dimensional extent; both dimensions default to ``0.0``."""

    width: float = 0.0
    height: float = 0.0
|
||
|
||
class BoundingBox(BaseModel):
    """Axis-aligned rectangle described by its four edges.

    ``coord_origin`` tells whether vertical coordinates are measured from the
    top-left corner (``t <= b`` for a well-formed box) or from the
    bottom-left corner (``t >= b``).
    """

    l: float  # left
    t: float  # top
    r: float  # right
    b: float  # bottom

    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT

    @property
    def width(self):
        """Horizontal extent, ``r - l``."""
        return self.r - self.l

    @property
    def height(self):
        """Vertical extent as an absolute value, whatever the origin."""
        return abs(self.t - self.b)

    def scaled(self, scale: float) -> "BoundingBox":
        """Return a deep copy with all four edges multiplied by ``scale``."""
        out_bbox = copy.deepcopy(self)
        out_bbox.l *= scale
        out_bbox.r *= scale
        out_bbox.t *= scale
        out_bbox.b *= scale

        return out_bbox

    def normalized(self, page_size: Size) -> "BoundingBox":
        """Return a deep copy with edges divided by the page dimensions.

        Horizontal edges are divided by ``page_size.width`` and vertical
        edges by ``page_size.height``.

        Raises:
            ZeroDivisionError: if the relevant page dimension is zero (note
                that ``Size`` defaults both dimensions to ``0.0``).
        """
        out_bbox = copy.deepcopy(self)
        out_bbox.l /= page_size.width
        out_bbox.r /= page_size.width
        out_bbox.t /= page_size.height
        out_bbox.b /= page_size.height

        return out_bbox

    def as_tuple(self):
        """Return the edges as a 4-tuple ordered according to the origin.

        ``(l, t, r, b)`` for a top-left origin and ``(l, b, r, t)`` for a
        bottom-left origin. Any other origin value falls through and yields
        ``None``.
        """
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return (self.l, self.t, self.r, self.b)
        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return (self.l, self.b, self.r, self.t)

    @classmethod
    def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
        """Build a box from the first four entries of ``coord``.

        The tuple is read in the order produced by ``as_tuple`` for the given
        ``origin``. Swapped edges are put back in order so that ``l <= r``
        and the top/bottom relation matches the origin. Any other origin
        value falls through and yields ``None``.
        """
        if origin == CoordOrigin.TOPLEFT:
            l, t, r, b = coord[0], coord[1], coord[2], coord[3]
            if r < l:
                l, r = r, l
            if b < t:
                b, t = t, b

            # ``cls`` (not a hard-coded class name) so subclasses get
            # instances of their own type.
            return cls(l=l, t=t, r=r, b=b, coord_origin=origin)
        elif origin == CoordOrigin.BOTTOMLEFT:
            l, b, r, t = coord[0], coord[1], coord[2], coord[3]
            if r < l:
                l, r = r, l
            if b > t:
                b, t = t, b

            return cls(l=l, t=t, r=r, b=b, coord_origin=origin)

    def area(self) -> float:
        """Return the area of the box.

        The vertical extent is taken as an absolute value so that boxes with
        a bottom-left origin (where ``t > b``) no longer give a negative area.
        """
        return (self.r - self.l) * self.height

    def intersection_area_with(self, other: "BoundingBox") -> float:
        """Return the overlap area with ``other``, or ``0.0`` if disjoint.

        Vertical spans are compared as (low, high) intervals, so the result
        is correct for either origin as long as both boxes share it.

        NOTE(review): boxes with different ``coord_origin`` values cannot be
        compared meaningfully without the page height; convert one first.
        """
        # Horizontal overlap.
        left = max(self.l, other.l)
        right = min(self.r, other.r)

        # Vertical overlap, independent of which of t/b is the larger value.
        low = max(min(self.t, self.b), min(other.t, other.b))
        high = min(max(self.t, self.b), max(other.t, other.b))

        width = right - left
        height = high - low

        # If the boxes do not overlap, width or height is zero or negative.
        if width <= 0 or height <= 0:
            return 0.0

        return width * height

    def to_bottom_left_origin(self, page_height) -> "BoundingBox":
        """Express the box relative to the bottom-left page corner.

        Returns ``self`` (not a copy) when the box already uses that origin;
        otherwise a new box with ``t`` and ``b`` mirrored about
        ``page_height``.
        """
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return self
        elif self.coord_origin == CoordOrigin.TOPLEFT:
            return BoundingBox(
                l=self.l,
                r=self.r,
                t=page_height - self.t,
                b=page_height - self.b,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            )

    def to_top_left_origin(self, page_height):
        """Express the box relative to the top-left page corner.

        Returns ``self`` (not a copy) when the box already uses that origin;
        otherwise a new box with ``t`` and ``b`` mirrored about
        ``page_height``.
        """
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return self
        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return BoundingBox(
                l=self.l,
                r=self.r,
                t=page_height - self.t,  # self.b
                b=page_height - self.b,  # self.t
                coord_origin=CoordOrigin.TOPLEFT,
            )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
from typing import Any, List, Dict, Optional, Tuple, Union | ||
|
||
from pydantic import BaseModel, Field, AnyUrl | ||
|
||
from docling_core.types.newdoc.base import Size, BoundingBox | ||
|
||
class FigureData(BaseModel):  # TBD
    """Placeholder for the payload of a figure; no fields are defined yet."""
|
||
class TableData(BaseModel):  # TBD
    """Placeholder for the payload of a table; no fields are defined yet."""
|
||
class RefItem(BaseModel):
    """Reference to another item, serialised under the ``$ref`` key."""

    # Path of the form "/<collection>/<index>", e.g. "/texts/0".
    cref: str = Field(alias="$ref")

    def resolve(self, doc: "DoclingDocument"):
        """Return the object in ``doc`` that this reference points to.

        ``cref`` is split on ``/`` into exactly three parts: an ignored
        prefix, the name of a list attribute on ``doc`` and an integer index
        into that list.

        Raises:
            ValueError: if ``cref`` does not split into exactly three parts
                or the last part is not an integer.
            AttributeError: if ``doc`` has no attribute named by the path.
            IndexError: if the index is outside the collection.
        """
        _, path, index_str = self.cref.split("/")
        position = int(index_str)
        # getattr() is the idiomatic spelling of attribute lookup by name.
        return getattr(doc, path)[position]
|
||
class ImageRef(BaseModel):
    """Pointer to an image resource together with its basic properties."""

    format: str  # png, etc.
    dpi: int  # ...
    size: Size
    uri: AnyUrl
|
||
|
||
class ProvenanceItem(BaseModel):
    """Where an item came from: page number, bounding box and char span."""

    page_no: int
    bbox: BoundingBox
    charspan: Tuple[int, int]
|
||
|
||
class DocItem(BaseModel):
    """Fields common to all document items; base class of the item types."""

    dloc: str  # format spec ({document_hash}{json-path})
    hash: int
    label: str
    parent: Optional[RefItem]
    children: List[RefItem]
    prov: List[ProvenanceItem]
|
||
class TextItem(DocItem):
    """Document item that carries a text string in two representations."""

    orig: str  # untreated representation
    text: str  # sanitized representation
|
||
class FloatingItem(DocItem):
    """Document item with caption, references, footnotes, data and image."""

    # Each of these may be given inline or as a reference to another item.
    caption: Optional[Union[RefItem, TextItem]]
    references: List[Union[RefItem, TextItem]]
    footnotes: List[Union[RefItem, TextItem]]
    data: Any
    image: Optional[ImageRef]
|
||
class FigureItem(DocItem):
    """Document item for a figure; ``data`` holds a ``FigureData``."""

    # NOTE(review): this derives from DocItem, not FloatingItem, so it
    # declares no caption/references/footnotes/image fields — confirm
    # whether FloatingItem was the intended base class.
    data: FigureData
|
||
class TableItem(DocItem):
    """Document item for a table; ``data`` holds a ``TableData``."""

    # NOTE(review): this derives from DocItem, not FloatingItem, so it
    # declares no caption/references/footnotes/image fields — confirm
    # whether FloatingItem was the intended base class.
    data: TableData
|
||
class KeyValueItem(DocItem):
    """Document item for key-value content; adds no fields to ``DocItem``."""
|
||
# Any concrete item type that may appear inline in a content list.
ContentItem = Union[TextItem, FigureItem, TableItem, KeyValueItem]
|
||
class DocumentContent(BaseModel):
    """Content lists shared by pages and whole documents; all default empty."""

    # Ordered content: each entry is a reference or an inline item.
    furniture: List[Union[RefItem, ContentItem]] = []
    body: List[Union[RefItem, ContentItem]] = []

    # Per-type item lists.
    texts: List[TextItem] = []
    figures: List[FigureItem] = []
    tables: List[TableItem] = []
    key_value_items: List[KeyValueItem] = []
|
||
class PageItem(DocumentContent):
    """Content of a single page plus its hash, size, image and item count."""

    hash: str  # page hash
    size: Size
    image: Optional[ImageRef]
    num_elements: int
|
||
class DoclingDocument(DocumentContent):
    """Top-level document: content lists plus description, file info, pages."""

    description: Any
    file_info: Any
    pages: Dict[int, PageItem] = {}  # empty as default
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
--- | ||
## Document with content + layout info | ||
description: { } # DescriptionType - TBD | ||
file_info: # FileInfoType - TBD | ||
document_hash: e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5 | ||
furniture: # Headers, footers, framing, navigation elements, all other non-body text | ||
- $ref: "/texts/0" | ||
|
||
body: # All elements in other arrays, by-reference only | ||
- $ref: "/texts/1" | ||
- $ref: "/figures/0" | ||
- $ref: "/texts/2" | ||
- $ref: "/texts/3" | ||
- $ref: "/tables/0" | ||
|
||
texts: # All elements that have a text-string representation, with actual data | ||
- orig: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022" | ||
text: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022" | ||
dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/0" | ||
hash: 132103230 | ||
label: "page_header" | ||
parent: null | ||
children: [ ] | ||
prov: | ||
- page_no: 1 | ||
bbox: | ||
l: 21.3 | ||
t: 52.3 | ||
b: 476.2 | ||
r: 35.2 | ||
charspan: [ 1,423 ] # 2-tuple, references to "orig" | ||
- orig: "DocLayNet: A Large Human-Annotated Dataset for\nDocument-Layout Analysis" | ||
text: "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" | ||
dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/1" | ||
hash: 2349732 # uint64 hash of dloc | ||
label: "title" | ||
parent: null | ||
children: [ ] | ||
prov: # must exist, can be empty | ||
- page_no: 1 | ||
bbox: | ||
l: 65.0 | ||
t: 30.1 | ||
b: 53.4 | ||
r: 623.2 | ||
charspan: [ 1,423 ] # 2-tuple, references to "orig" | ||
- orig: "OPERATION (cont.)" # nested inside the figure | ||
text: "OPERATION (cont.)" | ||
dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/2" | ||
hash: 6978483 | ||
label: "section_header" | ||
parent: | ||
$ref: "/figures/0" | ||
children: [ ] | ||
prov: | ||
- page_no: 1 | ||
bbox: | ||
l: 323.0 | ||
t: 354.3 | ||
b: 334.4 | ||
r: 376.0 | ||
charspan: [ 0,734 ] | ||
- orig: "Figure 1: Four examples of complex page layouts across dif-\nferent document categories" # nested inside the figure | ||
text: "Figure 1: Four examples of complex page layouts across different document categories" | ||
dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/texts/3" | ||
hash: 6978483 | ||
label: "caption" | ||
parent: | ||
$ref: "/figures/0" | ||
children: [ ] | ||
prov: | ||
- page_no: 1 | ||
bbox: | ||
l: 323.0 | ||
t: 354.3 | ||
b: 334.4 | ||
r: 376.0 | ||
coord_origin: "BOTTOMLEFT" | ||
charspan: [ 1,423 ] # 2-tuple, references to "orig" | ||
|
||
|
||
tables: # All tables... | ||
- dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/tables/0" | ||
hash: 98574 | ||
label: "table" | ||
parent: null | ||
children: [ ] | ||
caption: | ||
$ref: "/texts/3" | ||
references: | ||
- $ref: "/text/??" | ||
footnotes: | ||
- $ref: "/text/??" | ||
image: | ||
format: png | ||
dpi: 72 | ||
size: | ||
width: 231 | ||
height: 351 | ||
uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/tables/0.png" | ||
#alternatives: base64 encoded string | ||
data: # TableData Type | ||
grid: [ [ ] ] # list-of-list of TableCell type | ||
otsl: "<fcel><ecel>..." # OTSL token string | ||
html: "" # ?? | ||
prov: | ||
- page_no: 1 | ||
bbox: | ||
l: 323.0 | ||
t: 354.3 | ||
b: 334.4 | ||
r: 376.0 | ||
coord_origin: "BOTTOMLEFT" | ||
charspan: [ 1,423 ] # 2-tuple, references to "orig" | ||
|
||
figures: # All figures... | ||
- dloc: "e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5#/figures/0" | ||
hash: 7782482 | ||
label: "figure" | ||
parent: null | ||
caption: | ||
$ref: "/texts/2" | ||
references: | ||
- $ref: "/text/??" | ||
footnotes: | ||
- $ref: "/text/??" | ||
|
||
data: # FigureData Type | ||
classification: "illustration" | ||
confidence: 0.78 | ||
description: "...." | ||
# content structure? | ||
image: | ||
format: png | ||
dpi: 72 | ||
size: | ||
width: 231 | ||
height: 351 | ||
uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/figures/0.png" | ||
#alternatives: base64 encoded string | ||
children: | ||
- $ref: "/texts/2" | ||
prov: | ||
- page_no: 1 | ||
bbox: | ||
l: 456.3 | ||
t: 145.8 | ||
b: 623.4 | ||
r: 702.5 | ||
charspan: [ 0,288 ] | ||
|
||
key_value_items: [ ] # All KV-items | ||
|
||
# We should consider this for pages | ||
pages: # Optional, for layout documents | ||
1: | ||
hash: "5b0916ed3ead46e69efcddb2c932afd91d0e25ce6828c39e5617e6ee2bd0cf6e" | ||
size: | ||
width: 768.23 | ||
height: 583.15 | ||
image: | ||
format: png | ||
dpi: 144 | ||
size: | ||
width: 1536 | ||
height: 1166 | ||
uri: "file:///e6fc0db2ee6e7165e93c8286ec52e0d19dfa239c2bddcfe96e64dae3de6190b5/pages/1.png" | ||
#alternatives: base64 encoded string | ||
num_elements: 23 |
Oops, something went wrong.