@@ -6,11 +6,10 @@
 """Token-aware chunker implementation leveraging the document structure."""
 
 import warnings
-from dataclasses import dataclass
 from typing import Iterable, Iterator, Optional, Union
 
 import semchunk
-from pydantic import PositiveInt, TypeAdapter, model_validator
+from pydantic import BaseModel, ConfigDict, PositiveInt, TypeAdapter, model_validator
 from transformers import PreTrainedTokenizerBase
 from typing_extensions import Self
 
@@ -35,10 +34,7 @@ class TokenAwareChunker(BaseChunker):
         merge_peers: Whether to merge undersized chunks sharing same relevant metadata
     """
 
-    class Config:
-        """Pydantic config class."""
-
-        arbitrary_types_allowed = True
+    model_config = ConfigDict(arbitrary_types_allowed=True)
 
     tokenizer: PreTrainedTokenizerBase
     max_tokens: int = None  # type: ignore[assignment]
@@ -49,7 +45,6 @@ class Config:
     @model_validator(mode="after")
     def _patch_max_tokens(self) -> Self:
         if self.max_tokens is None:
-            print(f"{self.tokenizer.model_max_length=}")
             self.max_tokens = TypeAdapter(PositiveInt).validate_python(
                 self.tokenizer.model_max_length
             )
@@ -65,8 +60,7 @@ def _count_tokens(self, text: Optional[Union[str, list[str]]]):
             return total
         return len(self.tokenizer.tokenize(text, max_length=None))
 
-    @dataclass
-    class _ChunkLengthInfo:
+    class _ChunkLengthInfo(BaseModel):
         total_len: int
         text_len: int
         other_len: int
@@ -200,6 +194,7 @@ def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]):
             chunk = chunks[window_end]
             lengths = self._doc_chunk_length(chunk)
             headings_and_captions = (chunk.meta.headings, chunk.meta.captions)
+            ready_to_append = False
             if window_start == window_end:
                 # starting a new block of chunks to potentially merge
                 current_headings_and_captions = headings_and_captions
@@ -221,6 +216,9 @@ def _merge_chunks_with_matching_metadata(self, chunks: list[DocChunk]):
                     window_items = window_items + chunk.meta.doc_items
                     window_end += 1
                 else:
+                    ready_to_append = True
+
+            if ready_to_append or window_end == num_chunks:
                 # no more room OR the start of new metadata. Either way, end the block
                 # and use the current window_end as the start of a new block
                 if window_start + 1 == window_end:
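
For context, here is a minimal, self-contained sketch (not part of this diff) of the Pydantic v2 idioms the change adopts: model_config = ConfigDict(...) replaces the v1 nested class Config, and a plain BaseModel subclass replaces the @dataclass for the nested record type. The Tokenizer class and field names below are hypothetical stand-ins, not the chunker's actual API.

from pydantic import BaseModel, ConfigDict


class Tokenizer:
    """Hypothetical stand-in for a non-Pydantic type like PreTrainedTokenizerBase."""

    model_max_length = 512


class Chunker(BaseModel):
    # Pydantic v2: configuration is a class attribute, not a nested Config class;
    # arbitrary_types_allowed lets the model hold the plain Tokenizer instance.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    tokenizer: Tokenizer


class _LengthInfo(BaseModel):
    # A BaseModel replaces the former @dataclass and validates field types.
    total_len: int
    text_len: int
    other_len: int


chunker = Chunker(tokenizer=Tokenizer())
info = _LengthInfo(total_len=10, text_len=7, other_len=3)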
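
The ready_to_append change reshapes the merge loop's control flow: flushing moves out of the else branch into a single if that also fires when the final chunk closes a window, so the last window is no longer a special case. A simplified sketch of that pattern follows; merge, fits, and the list-of-strings setup are illustrative assumptions, not the chunker's real signatures.

from typing import Callable


def merge(items: list[str], fits: Callable[[list[str]], bool]) -> list[list[str]]:
    """Group consecutive items into maximal windows accepted by fits()."""
    output: list[list[str]] = []
    window_start = 0
    window_end = 0
    num_items = len(items)
    while window_end < num_items:
        ready_to_append = False
        if window_start == window_end or fits(items[window_start : window_end + 1]):
            # the window can still grow: absorb the current item
            window_end += 1
        else:
            # the window is full; defer the flush to the shared branch below
            ready_to_append = True

        if ready_to_append or window_end == num_items:
            # no more room OR the input is exhausted: emit the window and
            # start the next one at the current position
            output.append(items[window_start:window_end])
            window_start = window_end
    return output


assert merge(list("abcde"), lambda w: len(w) <= 2) == [["a", "b"], ["c", "d"], ["e"]]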