Skip to content

Commit 4c03504

Browse files
committed
fix handling of last chunk, minor improvements
Signed-off-by: Panos Vagenas <[email protected]>
1 parent a230cec commit 4c03504

File tree

1 file changed

+7
-9
lines changed

1 file changed

+7
-9
lines changed

docling_core/transforms/chunker/token_aware_chunker.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,10 @@
66
"""Token-aware chunker implementation leveraging the document structure."""
77

88
import warnings
9-
from dataclasses import dataclass
109
from typing import Iterable, Iterator, Optional, Union
1110

1211
import semchunk
13-
from pydantic import PositiveInt, TypeAdapter, model_validator
12+
from pydantic import BaseModel, ConfigDict, PositiveInt, TypeAdapter, model_validator
1413
from transformers import PreTrainedTokenizerBase
1514
from typing_extensions import Self
1615

@@ -35,10 +34,7 @@ class TokenAwareChunker(BaseChunker):
3534
merge_peers: Whether to merge undersized chunks sharing same relevant metadata
3635
"""
3736

38-
class Config:
39-
"""Pydantic config class."""
40-
41-
arbitrary_types_allowed = True
37+
model_config = ConfigDict(arbitrary_types_allowed=True)
4238

4339
tokenizer: PreTrainedTokenizerBase
4440
max_tokens: int = None # type: ignore[assignment]
@@ -49,7 +45,6 @@ class Config:
4945
@model_validator(mode="after")
5046
def _patch_max_tokens(self) -> Self:
5147
if self.max_tokens is None:
52-
print(f"{self.tokenizer.model_max_length=}")
5348
self.max_tokens = TypeAdapter(PositiveInt).validate_python(
5449
self.tokenizer.model_max_length
5550
)
@@ -65,8 +60,7 @@ def _count_tokens(self, text: Optional[Union[str, list[str]]]):
6560
return total
6661
return len(self.tokenizer.tokenize(text, max_length=None))
6762

68-
@dataclass
69-
class _ChunkLengthInfo:
63+
class _ChunkLengthInfo(BaseModel):
7064
total_len: int
7165
text_len: int
7266
other_len: int
@@ -200,6 +194,7 @@ def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]):
200194
chunk = chunks[window_end]
201195
lengths = self._doc_chunk_length(chunk)
202196
headings_and_captions = (chunk.meta.headings, chunk.meta.captions)
197+
ready_to_append = False
203198
if window_start == window_end:
204199
# starting a new block of chunks to potentially merge
205200
current_headings_and_captions = headings_and_captions
@@ -221,6 +216,9 @@ def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]):
221216
window_items = window_items + chunk.meta.doc_items
222217
window_end += 1
223218
else:
219+
ready_to_append = True
220+
221+
if ready_to_append or window_end == num_chunks:
224222
# no more room OR the start of new metadata. Either way, end the block
225223
# and use the current window_end as the start of a new block
226224
if window_start + 1 == window_end:

0 commit comments

Comments (0)