fix: align chunk ref format with one used in Document (#37)

Signed-off-by: Panos Vagenas <[email protected]>
DS4SD · Oct 7, 2024 · b5592ad
1 parent 4496b44
commit b5592ad
Show file tree

Hide file tree

Showing 7 changed files with 104 additions and 46 deletions.
diff --git a/docling_core/transforms/chunker/base.py b/docling_core/transforms/chunker/base.py
@@ -7,24 +7,40 @@
 from abc import ABC, abstractmethod
 from typing import Iterator, Optional
 
-from pydantic import BaseModel
+from pydantic import BaseModel, model_validator
 
 from docling_core.types import BoundingBox, Document
 
 
+def _create_path(pos: int, path_prefix: str = "main-text") -> str:
+    return f"#/{path_prefix}/{pos}"
+
+
 class Chunk(BaseModel):
     """Data model for Chunk."""
 
     path: str
     text: str
+    heading: Optional[str] = None
+
+    @model_validator(mode="before")
+    @classmethod
+    def _json_pointer_from_json_path(cls, data):
+        path = data.get("path")
+        if path.startswith("$."):
+            parts = path.split("[")
+            data["path"] = _create_path(
+                pos=parts[1][:-1],
+                path_prefix=parts[0][2:],
+            )
+        return data
 
 
 class ChunkWithMetadata(Chunk):
     """Data model for Chunk including metadata."""
 
     page: Optional[int] = None
     bbox: Optional[BoundingBox] = None
-    heading: Optional[str] = None
 
 
 class BaseChunker(BaseModel, ABC):
@@ -44,3 +60,10 @@ def chunk(self, dl_doc: Document, **kwargs) -> Iterator[Chunk]:
             Iterator[Chunk]: iterator over extracted chunks
         """
         raise NotImplementedError()
+
+    @classmethod
+    def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
+        return _create_path(
+            pos=pos,
+            path_prefix=path_prefix,
+        )
diff --git a/docling_core/transforms/chunker/hierarchical_chunker.py b/docling_core/transforms/chunker/hierarchical_chunker.py
@@ -12,7 +12,7 @@
 from typing import Any, Iterator, Optional, Union
 
 import pandas as pd
-from pydantic import BaseModel, PositiveInt
+from pydantic import BaseModel, Field, PositiveInt
 
 from docling_core.transforms.chunker import BaseChunker, Chunk, ChunkWithMetadata
 from docling_core.types import BaseText
@@ -25,9 +25,17 @@
 class HierarchicalChunker(BaseChunker):
     """Chunker implementation leveraging the document layout."""
 
-    include_metadata: bool = True
-    heading_as_metadata: bool = False
-    min_chunk_len: PositiveInt = 64
+    heading_as_metadata: bool = Field(
+        default=False,
+        description="Whether heading should be in metadata (instead of text)",
+    )
+    include_metadata: bool = Field(
+        default=True,
+        description="Whether to include extras in the metadata",
+    )
+    min_chunk_len: PositiveInt = Field(
+        default=64, description="Minimum chunk text length to consider (in chars)"
+    )
 
     class _NodeType(str, Enum):
         PARAGRAPH = "paragraph"
@@ -83,10 +91,6 @@ def _triplet_serialize(cls, table) -> Optional[str]:
 
         return output_text
 
-    @classmethod
-    def _create_path(cls, pos: int, path_prefix: str = "main-text") -> str:
-        return f"$.{path_prefix}[{pos}]"
-
     class _MainTextItemNode(BaseModel):
         parent: Optional[int] = None
         children: list[int] = []
@@ -304,14 +308,15 @@ def _build_chunk(
                 return ChunkWithMetadata(
                     text=concat,
                     path=path,
+                    heading=heading,
                     page=item.prov[0].page if item.prov else None,
                     bbox=item.prov[0].bbox if item.prov else None,
-                    heading=heading,
                 )
             else:
                 return Chunk(
                     text=concat,
                     path=path,
+                    heading=heading,
                 )
         else:
             return None
@@ -327,11 +332,6 @@ def chunk(self, dl_doc: DLDocument, delim="\n", **kwargs: Any) -> Iterator[Chunk
         Yields:
             Iterator[Chunk]: iterator over extracted chunks
         """
-        if (not self.include_metadata) and self.heading_as_metadata:
-            raise RuntimeError(
-                "To enable `heading_as_metadata`, also `include_metadata` must be True."
-            )
-
         if dl_doc.main_text:
             # extract doc structure incl. metadata for
             # each item (e.g. parent, children)

diff --git a/.../0_out_chunks_with_meta_incl_heading.json → ...t_chunks_heading_in_meta_with_extras.json b/.../0_out_chunks_with_meta_incl_heading.json → ...t_chunks_heading_in_meta_with_extras.json
@@ -1,7 +1,7 @@
 {
     "root": [
         {
-            "path": "$.main-text[0]",
+            "path": "#/main-text/0",
             "text": "This paragraph is marginally long enough for getting accepted as a chunk.",
             "page": 1,
             "bbox": [
@@ -12,40 +12,40 @@
             ]
         },
         {
-            "path": "$.main-text[4]",
+            "path": "#/main-text/4",
             "text": "This one should also include the subtitle above since it is long enough.",
+            "heading": "Some subtitle",
             "page": 3,
             "bbox": [
                 5.0,
                 6.0,
                 7.0,
                 8.0
-            ],
-            "heading": "Some subtitle"
+            ]
         },
         {
-            "path": "$.tables[0]",
+            "path": "#/tables/0",
             "text": "Atomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany",
+            "heading": "Acquisitions",
             "page": 4,
             "bbox": [
                 8.0,
                 9.0,
                 10.0,
                 11.0
-            ],
-            "heading": "Acquisitions"
+            ]
         },
         {
-            "path": "$.main-text[8]",
+            "path": "#/main-text/8",
             "text": "This paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.",
+            "heading": "Acquisitions",
             "page": 4,
             "bbox": [
                 8.0,
                 9.0,
                 10.0,
                 11.0
-            ],
-            "heading": "Acquisitions"
+            ]
         }
     ]
 }
diff --git a/test/data/chunker/0_out_chunks_heading_in_meta_wout_extras.json b/test/data/chunker/0_out_chunks_heading_in_meta_wout_extras.json
@@ -0,0 +1,23 @@
+{
+    "root": [
+        {
+            "path": "#/main-text/0",
+            "text": "This paragraph is marginally long enough for getting accepted as a chunk."
+        },
+        {
+            "path": "#/main-text/4",
+            "text": "This one should also include the subtitle above since it is long enough.",
+            "heading": "Some subtitle"
+        },
+        {
+            "path": "#/tables/0",
+            "text": "Atomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany",
+            "heading": "Acquisitions"
+        },
+        {
+            "path": "#/main-text/8",
+            "text": "This paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.",
+            "heading": "Acquisitions"
+        }
+    ]
+}
diff --git a/...out_chunks_with_meta_heading_in_text.json → ...t_chunks_heading_in_text_with_extras.json b/...out_chunks_with_meta_heading_in_text.json → ...t_chunks_heading_in_text_with_extras.json
@@ -1,7 +1,7 @@
 {
     "root": [
         {
-            "path": "$.main-text[0]",
+            "path": "#/main-text/0",
             "text": "This paragraph is marginally long enough for getting accepted as a chunk.",
             "page": 1,
             "bbox": [
@@ -12,7 +12,7 @@
             ]
         },
         {
-            "path": "$.main-text[4]",
+            "path": "#/main-text/4",
             "text": "Some subtitle\nThis one should also include the subtitle above since it is long enough.",
             "page": 3,
             "bbox": [
@@ -23,7 +23,7 @@
             ]
         },
         {
-            "path": "$.tables[0]",
+            "path": "#/tables/0",
             "text": "Acquisitions\nAtomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany",
             "page": 4,
             "bbox": [
@@ -34,7 +34,7 @@
             ]
         },
         {
-            "path": "$.main-text[7]",
+            "path": "#/main-text/7",
             "text": "Acquisitions\nThis paragraph should actually include the latest subtitle.",
             "page": 4,
             "bbox": [
@@ -45,7 +45,7 @@
             ]
         },
         {
-            "path": "$.main-text[8]",
+            "path": "#/main-text/8",
             "text": "Acquisitions\nThis paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.",
             "page": 4,
             "bbox": [

diff --git a/.../data/chunker/0_out_chunks_wout_meta.json → ...t_chunks_heading_in_text_wout_extras.json b/.../data/chunker/0_out_chunks_wout_meta.json → ...t_chunks_heading_in_text_wout_extras.json
@@ -1,23 +1,23 @@
 {
     "root": [
         {
-            "path": "$.main-text[0]",
+            "path": "#/main-text/0",
             "text": "This paragraph is marginally long enough for getting accepted as a chunk."
         },
         {
-            "path": "$.main-text[4]",
+            "path": "#/main-text/4",
             "text": "Some subtitle\nThis one should also include the subtitle above since it is long enough."
         },
         {
-            "path": "$.tables[0]",
+            "path": "#/tables/0",
             "text": "Acquisitions\nAtomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany"
         },
         {
-            "path": "$.main-text[7]",
+            "path": "#/main-text/7",
             "text": "Acquisitions\nThis paragraph should actually include the latest subtitle."
         },
         {
-            "path": "$.main-text[8]",
+            "path": "#/main-text/8",
             "text": "Acquisitions\nThis paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here."
         }
     ]

diff --git a/test/test_hierarchical_chunker.py b/test/test_hierarchical_chunker.py
@@ -9,37 +9,49 @@
 from docling_core.types import Document as DLDocument
 
 
-def test_chunk_without_metadata():
+def test_chunk_heading_in_text_wout_extras():
     with open("test/data/chunker/0_inp_dl_doc.json") as f:
         data_json = f.read()
     dl_doc = DLDocument.model_validate_json(data_json)
-    chunker = HierarchicalChunker(include_metadata=False)
+    chunker = HierarchicalChunker(heading_as_metadata=False, include_metadata=False)
     chunks = chunker.chunk(dl_doc=dl_doc)
-    act_data = dict(root=[n.model_dump() for n in chunks])
-    with open("test/data/chunker/0_out_chunks_wout_meta.json") as f:
+    act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
+    with open("test/data/chunker/0_out_chunks_heading_in_text_wout_extras.json") as f:
+        exp_data = json.load(fp=f)
+    assert exp_data == act_data
+
+
+def test_chunk_heading_in_text_with_extras():
+    with open("test/data/chunker/0_inp_dl_doc.json") as f:
+        data_json = f.read()
+    dl_doc = DLDocument.model_validate_json(data_json)
+    chunker = HierarchicalChunker(heading_as_metadata=False, include_metadata=True)
+    chunks = chunker.chunk(dl_doc=dl_doc)
+    act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
+    with open("test/data/chunker/0_out_chunks_heading_in_text_with_extras.json") as f:
         exp_data = json.load(fp=f)
     assert exp_data == act_data
 
 
-def test_chunk_with_metadata_heading_in_text():
+def test_chunk_heading_in_meta_wout_extras():
     with open("test/data/chunker/0_inp_dl_doc.json") as f:
         data_json = f.read()
     dl_doc = DLDocument.model_validate_json(data_json)
-    chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=False)
+    chunker = HierarchicalChunker(heading_as_metadata=True, include_metadata=False)
     chunks = chunker.chunk(dl_doc=dl_doc)
     act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
-    with open("test/data/chunker/0_out_chunks_with_meta_heading_in_text.json") as f:
+    with open("test/data/chunker/0_out_chunks_heading_in_meta_wout_extras.json") as f:
         exp_data = json.load(fp=f)
     assert exp_data == act_data
 
 
-def test_chunk_with_metadata_incl_heading():
+def test_chunk_heading_in_meta_with_extras():
     with open("test/data/chunker/0_inp_dl_doc.json") as f:
         data_json = f.read()
     dl_doc = DLDocument.model_validate_json(data_json)
-    chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=True)
+    chunker = HierarchicalChunker(heading_as_metadata=True, include_metadata=True)
     chunks = chunker.chunk(dl_doc=dl_doc)
     act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
-    with open("test/data/chunker/0_out_chunks_with_meta_incl_heading.json") as f:
+    with open("test/data/chunker/0_out_chunks_heading_in_meta_with_extras.json") as f:
         exp_data = json.load(fp=f)
     assert exp_data == act_data