# Standard
-from abc import ABC, abstractmethod
from collections import defaultdict
-from enum import Enum
from pathlib import Path
-from typing import DefaultDict, Iterable, List, Optional, Tuple
+from typing import Dict, Iterable, List, Optional
import json
import logging
import re
@@ -26,6 +24,7 @@

logger = logging.getLogger(__name__)
_DEFAULT_CHUNK_OVERLAP = 100
+SUPPORTED_FILETYPES = [".pdf", ".md"]


def _num_tokens_from_words(num_words) -> int:
@@ -68,186 +67,55 @@ def resolve_ocr_options() -> OcrOptions:
    return None


-class FileTypes(Enum):
-    MD = ".md"
-    PDF = ".pdf"
+def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]:
+    """Split document paths into a dict of lists based on their file extension."""
+    document_dict = defaultdict(list)
+    for path in document_paths:
+        filetype = path.suffix
+        if filetype not in SUPPORTED_FILETYPES:
+            raise ValueError(f"Provided unsupported filetype {filetype}")

+        document_dict[filetype].append(path)

-class ChunkerBase(ABC):
-    @abstractmethod
-    def chunk_documents(self):
-        pass
-
-
-class DocumentChunker:
-    """A factory chunker class that instantiates the applicable chunker
-
-    Currently, only Markdown and PDF are supported. For Markdown, returns
-    TextSplitChunker, and for PDF, returns ContextAwareChunker"""
-
-    def __new__(
-        cls,
-        leaf_node,
-        taxonomy_path,
-        output_dir: Path,
-        server_ctx_size=4096,
-        chunk_word_count=1024,
-        tokenizer_model_name: Optional[str] = None,
-        docling_model_path: Optional[str] = None,
-    ):
-        """Insantiate the appropriate chunker for the provided document
-
-        Args:
-            leaf_node: a leaf node dict containing "documents",
-                "filepaths", and "taxonomy_path" keys
-            output_dir (Path): directory where artifacts should be stored
-            server_ctx_size (int): Context window size of server
-            chunk_word_count (int): Maximum number of words to chunk a document
-            tokenizer_model_name (Optional[str]): name of huggingface model to get
-                tokenizer from
-        Returns:
-            TextSplitChunker | ContextAwareChunker: Object of the appropriate
-                chunker class for the provided filetype
-        """
-        documents = leaf_node[0]["documents"]
-
-        if not isinstance(taxonomy_path, Path):
-            taxonomy_path = Path(taxonomy_path)
+    return dict(document_dict)

-        if isinstance(documents, str):
-            documents = [documents]
-            logger.info(
-                "Converted single string into a list of string. Assumed the string passed in is the document. Normally, chunk_document() should take a list as input."
-            )
-        elif not isinstance(documents, list):
-            raise TypeError(
-                "Expected: documents to be a list, but got {}".format(type(documents))
-            )
-
-        filepaths = leaf_node[0]["filepaths"]
-
-        doc_dict = cls._split_docs_by_filetype(documents, filepaths)
-        if len(doc_dict.keys()) > 1:
-            raise ValueError("Received multiple document types")
-        if len(doc_dict.keys()) < 1:
-            raise ValueError("Received no document types")
-
-        if FileTypes.MD in doc_dict:
-            doc_contents = [d for d, _ in doc_dict[FileTypes.MD]]
-            return TextSplitChunker(
-                doc_contents,
-                server_ctx_size,
-                chunk_word_count,
-                output_dir,
-            )
-
-        if FileTypes.PDF in doc_dict:
-            doc_paths = [p for _, p in doc_dict[FileTypes.PDF]]
-            return ContextAwareChunker(
-                doc_paths,
-                filepaths,
-                output_dir,
-                chunk_word_count,
-                tokenizer_model_name,
-                docling_model_path=docling_model_path,
-            )

-    @staticmethod
-    def _split_docs_by_filetype(
-        documents: List[str], filepaths: List[Path]
-    ) -> DefaultDict[FileTypes, List[Tuple[str, Path]]]:
-        """Separate documents into lists based on their filetype.
-
-        Currently, only Markdown and PDF are supported.
-        Args:
-            documents (List[str]): A list of the document contents as strings
-            filepaths (List[Path]): Corresponding document filepaths
-        Returns:
-            DefaultDict: Dictionary with either ".md" or ".pdf" as a key.
-                Markdown items contain document contents, PDF items contain
-                paths to documents.
-        """
-        doc_dict = defaultdict(list)
-        for doc, path in zip(documents, filepaths):
-            if path.suffix == ".md":
-                # append doc contents
-                doc_dict[FileTypes.MD].append((doc, path))
-            elif path.suffix == ".pdf":
-                # append doc paths
-                doc_dict[FileTypes.PDF].append((doc, path))
-            else:
-                raise ValueError(
-                    f"Received document of type .{path.suffix}, which is not a supported filetype"
-                )
-        return doc_dict
-
-
-class TextSplitChunker(ChunkerBase):
+class DocumentChunker:  # pylint: disable=too-many-instance-attributes
    def __init__(
        self,
-        document_contents: List | str,
-        server_ctx_size: int,
-        chunk_word_count: int,
+        document_paths: List[Path],
        output_dir: Path,
+        tokenizer_model_name: str | Path,
+        docling_model_path: Optional[Path] = None,
+        server_ctx_size: int = 4096,
+        chunk_word_count: int = 1024,
    ):
-        self.document_contents = document_contents
-        self.server_ctx_size = server_ctx_size
-        self.chunk_word_count = chunk_word_count
-        self.output_dir = output_dir
+        if not document_paths:
+            raise ValueError("Provided empty list of documents")

-    def chunk_documents(self) -> List:
-        """Naively chunk markdown documents based on the word count provided by the user.
-        Returns:
-            List[str]: List of chunked documents.
-        """
-        num_tokens_per_doc = _num_tokens_from_words(self.chunk_word_count)
-        if num_tokens_per_doc > int(self.server_ctx_size - 1024):
-            raise ValueError(
-                "Error: {}".format(
-                    str(
-                        f"Given word count ({self.chunk_word_count}) per doc will exceed the server context window size ({self.server_ctx_size})"
-                    )
-                )
-            )
-        if self.document_contents == []:
-            return []
+        document_dict = split_docs_by_filetype(document_paths)

-        chunk_size = _num_chars_from_tokens(num_tokens_per_doc)
-        return chunk_markdowns(self.document_contents, chunk_size)
+        if len(document_dict) > 1:
+            raise ValueError("Provided multiple document types")

+        # We know there is only 1 key, value pair, so we take the first
+        self.document_filetype, self.document_paths = next(iter(document_dict.items()))
+        self.docling_model_path = docling_model_path
+        self.converter = self._init_docling_converter()

-class ContextAwareChunker(ChunkerBase):  # pylint: disable=too-many-instance-attributes
-    def __init__(
-        self,
-        document_paths,
-        filepaths,
-        output_dir: Path,
-        chunk_word_count: int,
-        tokenizer_model_name: Optional[str],
-        docling_model_path=None,
-    ):
-        self.document_paths = document_paths
-        self.filepaths = filepaths
        self.output_dir = self._path_validator(output_dir)
+        self.server_ctx_size = server_ctx_size
        self.chunk_word_count = chunk_word_count
        self.tokenizer = self.create_tokenizer(tokenizer_model_name)
-        self.docling_model_path = docling_model_path
-
-    def chunk_documents(self) -> List:
-        """Semantically chunk PDF documents.

-        Returns:
-            List: a list of chunks from the documents
-        """
+    def _init_docling_converter(self):
+        """Initialize docling converter with filetype-specific configurations"""
        # triggers torch loading, import lazily
        # pylint: disable=import-outside-toplevel
        # Third Party
        from docling.document_converter import DocumentConverter, PdfFormatOption
        from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

-        if self.document_paths == []:
-            return []
-
        if self.docling_model_path is None:
            logger.info("Docling models not found on disk, downloading models...")
            self.docling_model_path = StandardPdfPipeline.download_models_hf()
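For reference, a quick sketch of what the new `split_docs_by_filetype` helper returns; the paths below are hypothetical, for illustration only:

```python
from pathlib import Path

# Hypothetical document paths.
split_docs_by_filetype([Path("docs/a.pdf"), Path("docs/b.pdf")])
# {".pdf": [PosixPath("docs/a.pdf"), PosixPath("docs/b.pdf")]}

# Any extension outside SUPPORTED_FILETYPES fails fast:
split_docs_by_filetype([Path("docs/a.pdf"), Path("notes.txt")])
# ValueError: Provided unsupported filetype .txt
```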
@@ -258,17 +126,29 @@ def chunk_documents(self) -> List:
            artifacts_path=self.docling_model_path,
            do_ocr=False,
        )
+
        ocr_options = resolve_ocr_options()
        if ocr_options is not None:
            pipeline_options.do_ocr = True
            pipeline_options.ocr_options = ocr_options

-        converter = DocumentConverter(
+        return DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
            }
        )
-        parsed_documents = converter.convert_all(self.filepaths)
+
+    def chunk_documents(self) -> List:
+        """Split a list of documents into chunks
+
+        Returns:
+            List: a list of chunks from the documents
+        """
+
+        if self.document_paths == []:
+            return []
+
+        parsed_documents = self.converter.convert_all(self.document_paths)

        docling_artifacts_path = self.export_documents(parsed_documents)

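A minimal usage sketch of the refactored `DocumentChunker`, based on the constructor shown above; the paths and tokenizer name are placeholders, not values taken from this module:

```python
from pathlib import Path

# Placeholder paths and model name for illustration; only ".pdf" and ".md"
# inputs are accepted, and mixing the two types raises a ValueError.
chunker = DocumentChunker(
    document_paths=[Path("docs/report.pdf")],
    output_dir=Path("output/chunks"),
    tokenizer_model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
    server_ctx_size=4096,
    chunk_word_count=1024,
)

# With docling_model_path left as None, the docling models are downloaded
# from Hugging Face on first use.
chunks = chunker.chunk_documents()
```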
@@ -348,7 +228,7 @@ def fuse_texts(
        return fused_texts

    @staticmethod
-    def create_tokenizer(model_name: Optional[str]):
+    def create_tokenizer(model_path: str | Path):
        """
        Create a tokenizer instance from a pre-trained model or a local directory.

@@ -363,10 +243,8 @@ def create_tokenizer(model_name: Optional[str]):
        # Third Party
        from transformers import AutoTokenizer

-        if model_name is None:
-            raise TypeError("No model path provided")
-
-        model_path = Path(model_name)
+        if not isinstance(model_path, Path):
+            model_path = Path(model_path)
        error_info_message = (
            "Please run `ilab model download {download_args}` and try again"
        )
@@ -486,7 +364,7 @@ def get_table_page_number(self, json_book, idx):
                prev_page_num = book_element["prov"][0]["page"]
                break
        for book_element in json_book["main-text"][idx:]:
-            if "prov" in book_element:
+            if "prov" in book_element and book_element["prov"]:
                next_page_num = book_element["prov"][0]["page"]
                break
        if prev_page_num is not None and next_page_num is not None:
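The added truthiness check matters because an element can carry a `"prov"` key whose list is empty (see the TODO in the next hunk); indexing it directly would raise, as this small illustration shows:

```python
element = {"prov": []}  # "prov" is present but holds no provenance entries

# element["prov"][0]["page"] would raise IndexError here, so the page lookup
# only happens when the list is non-empty.
if "prov" in element and element["prov"]:
    page = element["prov"][0]["page"]
else:
    page = None
```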
@@ -545,8 +423,14 @@ def build_chunks_from_docling_json(
                    current_book_page_number = self.get_table_page_number(
                        json_book, idx
                    )
+                    book_text = self.get_table(json_book, book_element["$ref"])
+                elif book_element["prov"]:
+                    current_book_page_number = book_element["prov"][0][
+                        "page"
+                    ]  # TODO export to function to handle empty ["prov"]
+                    book_text = book_element["text"]
                else:
-                    current_book_page_number = book_element["prov"][0]["page"]
+                    current_book_page_number = None
                    book_text = book_element["text"]

                if book_element["type"] == "subtitle-level-1":
@@ -599,8 +483,6 @@ def build_chunks_from_docling_json(

                if book_element["type"] == "paragraph":
                    book_text = self.add_heading_formatting(book_text)
-                elif book_element["type"] == "table":
-                    book_text = self.get_table(json_book, book_element["$ref"])
                if "## References" in book_text or "## Acknowledgements" in book_text:
                    # For research papers we ignore everything after this sections
                    break
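Taken together, the last two hunks fetch table text inside the page-number branch and fall back to `None` when an element has no provenance. Roughly, the per-element handling now behaves like this sketch (the helper name is illustrative, not part of the module):

```python
def resolve_page_and_text(chunker, json_book, idx, book_element):
    """Illustrative mirror of the new branch order in build_chunks_from_docling_json."""
    if book_element["type"] == "table":
        # Tables take their page from neighbouring elements and their text from the table body.
        page = chunker.get_table_page_number(json_book, idx)
        text = chunker.get_table(json_book, book_element["$ref"])
    elif book_element["prov"]:
        # Regular elements carry their own provenance.
        page = book_element["prov"][0]["page"]
        text = book_element["text"]
    else:
        # Elements with an empty "prov" list no longer raise; the page is simply unknown.
        page = None
        text = book_element["text"]
    return page, text
```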