Commit 9c48164

removed dependency on chonkie

1 parent 67d8c39
File tree: 3 files changed (+19, -31 lines): package.json, requirements.txt, src/hierarchical.py


package.json (+1, -1)

@@ -12,5 +12,5 @@
       "type": "DocumentType"
     }
   ],
-  "dependencies": []
+  "dependencies": ["ExtensityAI/chonkie-symai"]
 }
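
The direct pip dependency on chonkie is replaced here by a symai package dependency, resolved at runtime through symai's Import mechanism. A minimal sketch of the loading pattern, mirroring the call this commit adds to src/hierarchical.py (that Import fetches the GitHub-hosted package on first use is an assumption about symai's plugin system):

    from symai import Import

    # Load the ChonkieChunker expression from the chonkie-symai package;
    # symai is expected to resolve "ExtensityAI/chonkie-symai" if it is
    # not already installed locally.
    ChonkieChunker = Import.load_expression("ExtensityAI/chonkie-symai", "ChonkieChunker")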

requirements.txt (+1, -2)

@@ -1,4 +1,3 @@
 pydantic
 nest_asyncio
-tenacity
-chonkie
+tenacity

src/hierarchical.py (+17, -28)
@@ -7,28 +7,29 @@
 from textwrap import dedent
 from typing import List, Optional
 
-from chonkie import BaseChunker, BaseEmbeddings
+import nest_asyncio
+from loguru import logger
+from pydantic import Field, field_validator
+from symai import Import, Symbol
+from symai.components import FileReader, Function
+from symai.core_ext import bind
+from symai.models import LLMDataModel
 from tenacity import (
     before_sleep_log,
     retry,
+    retry_if_exception_type,
     stop_after_attempt,
     wait_exponential_jitter,
-    retry_if_exception_type,
 )
-import nest_asyncio
-from loguru import logger
-from pydantic import Field, field_validator
-import tiktoken
 from tiktoken import Encoding
 from tokenizers import Tokenizer
-from chonkie import RecursiveChunker
-from symai.components import FileReader, Function
-from symai.core_ext import bind
-from symai.models import LLMDataModel
 
 from .functions import ValidatedFunction
 from .types import TYPE_SPECIFIC_PROMPTS, DocumentType
 
+# Load the chunker
+ChonkieChunker = Import.load_expression("ExtensityAI/chonkie-symai", "ChonkieChunker")
+
 
 class Summary(LLMDataModel):
     summary: str = Field(
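
This hunk mostly regroups imports: the chonkie imports are dropped in favor of symai's Import, and retry_if_exception_type moves into sorted position in the tenacity block. For context, a sketch of how these tenacity primitives typically compose (the decorated function, exception type, and logger are illustrative, not from this repository):

    import logging

    from tenacity import (
        before_sleep_log,
        retry,
        retry_if_exception_type,
        stop_after_attempt,
        wait_exponential_jitter,
    )

    log = logging.getLogger(__name__)

    @retry(
        retry=retry_if_exception_type(TimeoutError),          # retry only on timeouts
        stop=stop_after_attempt(5),                           # at most 5 attempts
        wait=wait_exponential_jitter(initial=1, max=30),      # exponential backoff with jitter
        before_sleep=before_sleep_log(log, logging.WARNING),  # log before each retry sleep
    )
    def call_model(prompt: str) -> str:
        ...
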
@@ -112,8 +113,8 @@ def __init__(
         max_output_tokens: int = 10000,
         user_prompt: str = None,
         include_quotes: bool = False,
-        tokenizer: str | BaseEmbeddings | Encoding = "gpt2",
-        chunker: BaseChunker = RecursiveChunker,
+        tokenizer_name: str = "gpt2",
+        chunker_name: str = "RecursiveChunker",
         seed: int = 42,
         *args,
         **kwargs,
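
The constructor now takes plain strings instead of chonkie objects. A sketch of the caller-facing difference (HierarchicalSummary and file_path are hypothetical stand-ins; the diff does not show the class name or its other arguments):

    # Before: a tokenizer object/model name plus a chonkie chunker class.
    # summarizer = HierarchicalSummary(file_path, tokenizer="gpt2", chunker=RecursiveChunker)

    # After: plain strings; both are resolved inside the dynamically
    # loaded ChonkieChunker expression rather than in this class.
    summarizer = HierarchicalSummary(file_path, tokenizer_name="gpt2", chunker_name="RecursiveChunker")
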
@@ -152,21 +153,9 @@ def __init__(
         self.content = f"[[DOCUMENT::{file_name}]]: <<<\n{str(file_content)}\n>>>\n"
         self.content_only = str(file_content)
 
-        # init tokenizer
-        if isinstance(tokenizer, str):
-            try:
-                self.tokenizer = tiktoken.encoding_for_model(tokenizer)
-            except:
-                try:
-                    self.tokenizer = Tokenizer.from_pretrained(tokenizer)
-                except:
-                    logger.warning(
-                        f"Tokenizer {tokenizer} not found, using o200k_base tokenizer instead."
-                    )
-                    self.tokenizer = tiktoken.get_encoding('o200k_base')
-        else:
-            self.tokenizer = tokenizer
-        self.chunker = chunker
+        # init chunker
+        self.chunker = ChonkieChunker(tokenizer_name=tokenizer_name)
+        self.chunker_type = chunker_name
 
         # Content type is unknown at initialization
         self.document_type = None
@@ -296,7 +285,7 @@ def split_words(self, text):
     def chunk_by_token_count(self, text, chunk_size, include_context=False):
         # prepare results
         logger.debug(f"Chunking with chunk size: {chunk_size}")
-        chunks = self.chunker(self.tokenizer, chunk_size=chunk_size)(text)
+        chunks = self.chunker(data=Symbol(text), chunker_name=self.chunker_type, chunk_size=chunk_size)
         logger.debug(f"Number of chunks: {len(chunks)}")
         return chunks
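
With this change, the tiktoken/Hugging Face tokenizer fallback removed above presumably lives inside the chonkie-symai package, and tokenizer_name is forwarded verbatim. A self-contained sketch of the new chunking path (argument names copied from the diff; ChonkieChunker's exact behavior is an assumption):

    from symai import Import, Symbol

    # Resolve the chunker expression from the chonkie-symai package.
    ChonkieChunker = Import.load_expression("ExtensityAI/chonkie-symai", "ChonkieChunker")

    long_text = "..."  # any document text
    chunker = ChonkieChunker(tokenizer_name="gpt2")
    # chunker_name picks the chonkie strategy; chunk_size is the
    # per-chunk token budget.
    chunks = chunker(data=Symbol(long_text), chunker_name="RecursiveChunker", chunk_size=512)
    print(f"Number of chunks: {len(chunks)}")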
