splitter.py
from typing import List
from langchain.schema import Document
from langchain.text_splitter import SentenceTransformersTokenTextSplitter


def langchain_document_to_dict(doc: Document):
    """
    Converts a LangChain Document to a dictionary.
    """
    return {"page_content": doc.page_content, "metadata": doc.metadata}


def dict_to_langchain_document(doc: dict):
    """
    Converts a dictionary to a LangChain Document.
    """
    return Document(page_content=doc["page_content"], metadata=doc["metadata"])


def get_split_documents_using_token_based(model_name: str, documents: List[dict], chunk_size: int, chunk_overlap: int):
    """
    Splits documents into chunks using token-based splitting with a
    Sentence Transformers model.
    """
    splitter = SentenceTransformersTokenTextSplitter(
        chunk_overlap=chunk_overlap, model_name=model_name, tokens_per_chunk=chunk_size
    )
    langchain_docs = [dict_to_langchain_document(d) for d in documents]
    splitted_docs = splitter.split_documents(documents=langchain_docs)
    return [langchain_document_to_dict(d) for d in splitted_docs]
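

# A minimal usage sketch (not part of the original module): the model name and the
# sample document below are assumed for illustration only; any Sentence Transformers
# model accepted by SentenceTransformersTokenTextSplitter should work the same way.
if __name__ == "__main__":
    sample_docs = [
        {
            "page_content": (
                "LangChain text splitters break long documents into smaller chunks "
                "so they fit within an embedding model's token limit."
            ),
            "metadata": {"source": "example.txt"},
        }
    ]

    chunks = get_split_documents_using_token_based(
        model_name="sentence-transformers/all-MiniLM-L6-v2",  # assumed model name
        documents=sample_docs,
        chunk_size=128,
        chunk_overlap=16,
    )

    # Each chunk is a plain dict with the same shape as the input documents.
    for chunk in chunks:
        print(chunk["metadata"], "->", len(chunk["page_content"]), "characters")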