Skip to content

Commit

Permalink
fix: align chunk ref format with one used in Document (#37)
Browse files Browse the repository at this point in the history
Signed-off-by: Panos Vagenas <[email protected]>
  • Loading branch information
vagenas authored Oct 7, 2024
1 parent 4496b44 commit b5592ad
Show file tree
Hide file tree
Showing 7 changed files with 104 additions and 46 deletions.
27 changes: 25 additions & 2 deletions docling_core/transforms/chunker/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,24 +7,40 @@
from abc import ABC, abstractmethod
from typing import Iterator, Optional

from pydantic import BaseModel
from pydantic import BaseModel, model_validator

from docling_core.types import BoundingBox, Document


def _create_path(pos: int, path_prefix: str = "main-text") -> str:
    """Build a reference path of the form ``#/<path_prefix>/<pos>``.

    Args:
        pos: value placed in the last path segment.
        path_prefix: value placed in the first path segment.
            Defaults to "main-text".

    Returns:
        str: the assembled path, e.g. ``#/main-text/0``.
    """
    return "#/{}/{}".format(path_prefix, pos)


class Chunk(BaseModel):
    """Data model for Chunk.

    `path` is kept in the ``#/<prefix>/<pos>`` form built by `_create_path`;
    inputs that still use the ``$.<prefix>[<pos>]`` form are converted before
    field validation.
    """

    path: str
    text: str
    heading: Optional[str] = None

    @model_validator(mode="before")
    @classmethod
    def _json_pointer_from_json_path(cls, data):
        """Rewrite a ``$.<prefix>[<pos>]`` path as ``#/<prefix>/<pos>``.

        This runs before field validation, so `data` is the raw input: it is
        not guaranteed to be a dict, nor to hold a string under "path".
        Anything that does not look like the old format is returned untouched
        so that regular field validation reports missing or ill-typed values.

        Args:
            data: raw input handed to the model.

        Returns:
            `data` itself when nothing needs converting, otherwise a shallow
            copy of it with "path" rewritten.
        """
        if not isinstance(data, dict):
            return data
        path = data.get("path")
        if not isinstance(path, str) or not path.startswith("$."):
            return data
        parts = path.split("[")
        if len(parts) < 2:
            # no "[<pos>]" component to translate, e.g. "$.main-text"
            return data
        # build a new dict rather than mutating the caller's input in place
        return {
            **data,
            "path": _create_path(
                pos=parts[1][:-1],  # drop the trailing "]"
                path_prefix=parts[0][2:],  # drop the leading "$."
            ),
        }


class ChunkWithMetadata(Chunk):
    """Data model for Chunk including metadata.

    Adds optional fields on top of `Chunk`; each of them defaults to None.
    """

    # HierarchicalChunker._build_chunk fills these two from the first
    # provenance entry of the item (`item.prov[0]`) when one exists.
    page: Optional[int] = None
    bbox: Optional[BoundingBox] = None
    # NOTE(review): `Chunk` already declares an identical `heading` field, so
    # this re-declaration looks redundant — confirm before removing.
    heading: Optional[str] = None


class BaseChunker(BaseModel, ABC):
Expand All @@ -44,3 +60,10 @@ def chunk(self, dl_doc: Document, **kwargs) -> Iterator[Chunk]:
Iterator[Chunk]: iterator over extracted chunks
"""
raise NotImplementedError()

@classmethod
def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
    """Build a reference path by delegating to the module-level helper.

    Args:
        pos: value placed in the last path segment.
        path_prefix: value placed in the first path segment.
            Defaults to "main-text".

    Returns:
        str: whatever the module-level `_create_path` returns for the
            same arguments.
    """
    # The bare name resolves in module scope, so this calls the module-level
    # function and does not recurse into this classmethod.
    return _create_path(pos, path_prefix)
28 changes: 14 additions & 14 deletions docling_core/transforms/chunker/hierarchical_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from typing import Any, Iterator, Optional, Union

import pandas as pd
from pydantic import BaseModel, PositiveInt
from pydantic import BaseModel, Field, PositiveInt

from docling_core.transforms.chunker import BaseChunker, Chunk, ChunkWithMetadata
from docling_core.types import BaseText
Expand All @@ -25,9 +25,17 @@
class HierarchicalChunker(BaseChunker):
"""Chunker implementation leveraging the document layout."""

include_metadata: bool = True
heading_as_metadata: bool = False
min_chunk_len: PositiveInt = 64
heading_as_metadata: bool = Field(
default=False,
description="Whether heading should be in metadata (instead of text)",
)
include_metadata: bool = Field(
default=True,
description="Whether to include extras in the metadata",
)
min_chunk_len: PositiveInt = Field(
default=64, description="Minimum chunk text length to consider (in chars)"
)

class _NodeType(str, Enum):
PARAGRAPH = "paragraph"
Expand Down Expand Up @@ -83,10 +91,6 @@ def _triplet_serialize(cls, table) -> Optional[str]:

return output_text

@classmethod
def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
return f"$.{path_prefix}[{pos}]"

class _MainTextItemNode(BaseModel):
parent: Optional[int] = None
children: list[int] = []
Expand Down Expand Up @@ -304,14 +308,15 @@ def _build_chunk(
return ChunkWithMetadata(
text=concat,
path=path,
heading=heading,
page=item.prov[0].page if item.prov else None,
bbox=item.prov[0].bbox if item.prov else None,
heading=heading,
)
else:
return Chunk(
text=concat,
path=path,
heading=heading,
)
else:
return None
Expand All @@ -327,11 +332,6 @@ def chunk(self, dl_doc: DLDocument, delim="\n", **kwargs: Any) -> Iterator[Chunk
Yields:
Iterator[Chunk]: iterator over extracted chunks
"""
if (not self.include_metadata) and self.heading_as_metadata:
raise RuntimeError(
"To enable `heading_as_metadata`, also `include_metadata` must be True."
)

if dl_doc.main_text:
# extract doc structure incl. metadata for
# each item (e.g. parent, children)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"root": [
{
"path": "$.main-text[0]",
"path": "#/main-text/0",
"text": "This paragraph is marginally long enough for getting accepted as a chunk.",
"page": 1,
"bbox": [
Expand All @@ -12,40 +12,40 @@
]
},
{
"path": "$.main-text[4]",
"path": "#/main-text/4",
"text": "This one should also include the subtitle above since it is long enough.",
"heading": "Some subtitle",
"page": 3,
"bbox": [
5.0,
6.0,
7.0,
8.0
],
"heading": "Some subtitle"
]
},
{
"path": "$.tables[0]",
"path": "#/tables/0",
"text": "Atomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany",
"heading": "Acquisitions",
"page": 4,
"bbox": [
8.0,
9.0,
10.0,
11.0
],
"heading": "Acquisitions"
]
},
{
"path": "$.main-text[8]",
"path": "#/main-text/8",
"text": "This paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.",
"heading": "Acquisitions",
"page": 4,
"bbox": [
8.0,
9.0,
10.0,
11.0
],
"heading": "Acquisitions"
]
}
]
}
23 changes: 23 additions & 0 deletions test/data/chunker/0_out_chunks_heading_in_meta_wout_extras.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{
"root": [
{
"path": "#/main-text/0",
"text": "This paragraph is marginally long enough for getting accepted as a chunk."
},
{
"path": "#/main-text/4",
"text": "This one should also include the subtitle above since it is long enough.",
"heading": "Some subtitle"
},
{
"path": "#/tables/0",
"text": "Atomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany",
"heading": "Acquisitions"
},
{
"path": "#/main-text/8",
"text": "This paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.",
"heading": "Acquisitions"
}
]
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"root": [
{
"path": "$.main-text[0]",
"path": "#/main-text/0",
"text": "This paragraph is marginally long enough for getting accepted as a chunk.",
"page": 1,
"bbox": [
Expand All @@ -12,7 +12,7 @@
]
},
{
"path": "$.main-text[4]",
"path": "#/main-text/4",
"text": "Some subtitle\nThis one should also include the subtitle above since it is long enough.",
"page": 3,
"bbox": [
Expand All @@ -23,7 +23,7 @@
]
},
{
"path": "$.tables[0]",
"path": "#/tables/0",
"text": "Acquisitions\nAtomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany",
"page": 4,
"bbox": [
Expand All @@ -34,7 +34,7 @@
]
},
{
"path": "$.main-text[7]",
"path": "#/main-text/7",
"text": "Acquisitions\nThis paragraph should actually include the latest subtitle.",
"page": 4,
"bbox": [
Expand All @@ -45,7 +45,7 @@
]
},
{
"path": "$.main-text[8]",
"path": "#/main-text/8",
"text": "Acquisitions\nThis paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.",
"page": 4,
"bbox": [
Expand Down
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
{
"root": [
{
"path": "$.main-text[0]",
"path": "#/main-text/0",
"text": "This paragraph is marginally long enough for getting accepted as a chunk."
},
{
"path": "$.main-text[4]",
"path": "#/main-text/4",
"text": "Some subtitle\nThis one should also include the subtitle above since it is long enough."
},
{
"path": "$.tables[0]",
"path": "#/tables/0",
"text": "Acquisitions\nAtomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany"
},
{
"path": "$.main-text[7]",
"path": "#/main-text/7",
"text": "Acquisitions\nThis paragraph should actually include the latest subtitle."
},
{
"path": "$.main-text[8]",
"path": "#/main-text/8",
"text": "Acquisitions\nThis paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here."
}
]
Expand Down
32 changes: 22 additions & 10 deletions test/test_hierarchical_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,37 +9,49 @@
from docling_core.types import Document as DLDocument


def test_chunk_without_metadata():
def test_chunk_heading_in_text_wout_extras():
with open("test/data/chunker/0_inp_dl_doc.json") as f:
data_json = f.read()
dl_doc = DLDocument.model_validate_json(data_json)
chunker = HierarchicalChunker(include_metadata=False)
chunker = HierarchicalChunker(heading_as_metadata=False, include_metadata=False)
chunks = chunker.chunk(dl_doc=dl_doc)
act_data = dict(root=[n.model_dump() for n in chunks])
with open("test/data/chunker/0_out_chunks_wout_meta.json") as f:
act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
with open("test/data/chunker/0_out_chunks_heading_in_text_wout_extras.json") as f:
exp_data = json.load(fp=f)
assert exp_data == act_data


def test_chunk_heading_in_text_with_extras():
    """Compare chunker output against the stored fixture.

    Uses `heading_as_metadata=False` and `include_metadata=True`.
    """
    with open("test/data/chunker/0_inp_dl_doc.json") as src:
        dl_doc = DLDocument.model_validate_json(src.read())

    chunker = HierarchicalChunker(heading_as_metadata=False, include_metadata=True)
    # None-valued fields are dropped so the dump matches the stored JSON
    dumped = [
        chunk.model_dump(exclude_none=True) for chunk in chunker.chunk(dl_doc=dl_doc)
    ]
    act_data = {"root": dumped}

    with open("test/data/chunker/0_out_chunks_heading_in_text_with_extras.json") as ref:
        exp_data = json.load(fp=ref)
    assert exp_data == act_data


def test_chunk_with_metadata_heading_in_text():
def test_chunk_heading_in_meta_wout_extras():
with open("test/data/chunker/0_inp_dl_doc.json") as f:
data_json = f.read()
dl_doc = DLDocument.model_validate_json(data_json)
chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=False)
chunker = HierarchicalChunker(heading_as_metadata=True, include_metadata=False)
chunks = chunker.chunk(dl_doc=dl_doc)
act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
with open("test/data/chunker/0_out_chunks_with_meta_heading_in_text.json") as f:
with open("test/data/chunker/0_out_chunks_heading_in_meta_wout_extras.json") as f:
exp_data = json.load(fp=f)
assert exp_data == act_data


def test_chunk_with_metadata_incl_heading():
def test_chunk_heading_in_meta_with_extras():
with open("test/data/chunker/0_inp_dl_doc.json") as f:
data_json = f.read()
dl_doc = DLDocument.model_validate_json(data_json)
chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=True)
chunker = HierarchicalChunker(heading_as_metadata=True, include_metadata=True)
chunks = chunker.chunk(dl_doc=dl_doc)
act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
with open("test/data/chunker/0_out_chunks_with_meta_incl_heading.json") as f:
with open("test/data/chunker/0_out_chunks_heading_in_meta_with_extras.json") as f:
exp_data = json.load(fp=f)
assert exp_data == act_data

0 comments on commit b5592ad

Please sign in to comment.