Skip to content

Commit

Permalink
fix handling of last chunk, minor improvements
Browse files (browse the repository at this point in the history)
Signed-off-by: Panos Vagenas <[email protected]>
  • Loading branch information
vagenas committed Dec 3, 2024
1 parent a230cec commit 4c03504
Showing 1 changed file with 7 additions and 9 deletions.
16 changes: 7 additions & 9 deletions docling_core/transforms/chunker/token_aware_chunker.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -6,11 +6,10 @@
"""Token-aware chunker implementation leveraging the document structure."""

import warnings
- from dataclasses import dataclass
from typing import Iterable, Iterator, Optional, Union

import semchunk
- from pydantic import PositiveInt, TypeAdapter, model_validator
+ from pydantic import BaseModel, ConfigDict, PositiveInt, TypeAdapter, model_validator
from transformers import PreTrainedTokenizerBase
from typing_extensions import Self

Expand All @@ -35,10 +34,7 @@ class TokenAwareChunker(BaseChunker):
merge_peers: Whether to merge undersized chunks sharing same relevant metadata
"""

- class Config:
- """Pydantic config class."""
-
- arbitrary_types_allowed = True
+ model_config = ConfigDict(arbitrary_types_allowed=True)

tokenizer: PreTrainedTokenizerBase
max_tokens: int = None # type: ignore[assignment]
Expand All @@ -49,7 +45,6 @@ class Config:
@model_validator(mode="after")
def _patch_max_tokens(self) -> Self:
if self.max_tokens is None:
- print(f"{self.tokenizer.model_max_length=}")
self.max_tokens = TypeAdapter(PositiveInt).validate_python(
self.tokenizer.model_max_length
)
Expand All @@ -65,8 +60,7 @@ def _count_tokens(self, text: Optional[Union[str, list[str]]]):
return total
return len(self.tokenizer.tokenize(text, max_length=None))

- @dataclass
- class _ChunkLengthInfo:
+ class _ChunkLengthInfo(BaseModel):
total_len: int
text_len: int
other_len: int
Expand Down Expand Up @@ -200,6 +194,7 @@ def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]):
chunk = chunks[window_end]
lengths = self._doc_chunk_length(chunk)
headings_and_captions = (chunk.meta.headings, chunk.meta.captions)
+ ready_to_append = False
if window_start == window_end:
# starting a new block of chunks to potentially merge
current_headings_and_captions = headings_and_captions
Expand All @@ -221,6 +216,9 @@ def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]):
window_items = window_items + chunk.meta.doc_items
window_end += 1
else:
+ ready_to_append = True
+
+ if ready_to_append or window_end == num_chunks:
# no more room OR the start of new metadata. Either way, end the block
# and use the current window_end as the start of a new block
if window_start + 1 == window_end:
Expand Down

0 comments on commit 4c03504

Please sign in to comment.