
Commit 13bf595

Merge pull request #430 from khaledsulayman/ks-chunking-refactor
Refactor Document Chunker to always use docling
2 parents f67e7d7 + e3a3e1e commit 13bf595

File tree: 8 files changed (+317, -430 lines)

src/instructlab/sdg/generate_data.py

Lines changed: 1 addition & 2 deletions
@@ -395,8 +395,7 @@ def generate_data(
         is_knowledge = False
         leaf_node_path = leaf_node[0]["taxonomy_path"].replace("->", "_")
         samples = leaf_node_to_samples(
-            leaf_node,
-            taxonomy,
+            leaf_node,  # pylint: disable=duplicate-code
             server_ctx_size,
             chunk_word_count,
             document_output_dir,

src/instructlab/sdg/utils/chunkers.py

Lines changed: 54 additions & 172 deletions
@@ -1,9 +1,7 @@
 # Standard
-from abc import ABC, abstractmethod
 from collections import defaultdict
-from enum import Enum
 from pathlib import Path
-from typing import DefaultDict, Iterable, List, Optional, Tuple
+from typing import Dict, Iterable, List, Optional
 import json
 import logging
 import re
@@ -26,6 +24,7 @@
 
 logger = logging.getLogger(__name__)
 _DEFAULT_CHUNK_OVERLAP = 100
+SUPPORTED_FILETYPES = [".pdf", ".md"]
 
 
 def _num_tokens_from_words(num_words) -> int:
@@ -68,186 +67,55 @@ def resolve_ocr_options() -> OcrOptions:
     return None
 
 
-class FileTypes(Enum):
-    MD = ".md"
-    PDF = ".pdf"
+def split_docs_by_filetype(document_paths: List[Path]) -> Dict[str, List[Path]]:
+    """Split document paths into a dict of lists based on their file extension."""
+    document_dict = defaultdict(list)
+    for path in document_paths:
+        filetype = path.suffix
+        if filetype not in SUPPORTED_FILETYPES:
+            raise ValueError(f"Provided unsupported filetype {filetype}")
 
+        document_dict[filetype].append(path)
 
-class ChunkerBase(ABC):
-    @abstractmethod
-    def chunk_documents(self):
-        pass
-
-
-class DocumentChunker:
-    """A factory chunker class that instantiates the applicable chunker
-
-    Currently, only Markdown and PDF are supported. For Markdown, returns
-    TextSplitChunker, and for PDF, returns ContextAwareChunker"""
-
-    def __new__(
-        cls,
-        leaf_node,
-        taxonomy_path,
-        output_dir: Path,
-        server_ctx_size=4096,
-        chunk_word_count=1024,
-        tokenizer_model_name: Optional[str] = None,
-        docling_model_path: Optional[str] = None,
-    ):
-        """Insantiate the appropriate chunker for the provided document
-
-        Args:
-            leaf_node: a leaf node dict containing "documents",
-                "filepaths", and "taxonomy_path" keys
-            output_dir (Path): directory where artifacts should be stored
-            server_ctx_size (int): Context window size of server
-            chunk_word_count (int): Maximum number of words to chunk a document
-            tokenizer_model_name (Optional[str]): name of huggingface model to get
-                tokenizer from
-        Returns:
-            TextSplitChunker | ContextAwareChunker: Object of the appropriate
-                chunker class for the provided filetype
-        """
-        documents = leaf_node[0]["documents"]
-
-        if not isinstance(taxonomy_path, Path):
-            taxonomy_path = Path(taxonomy_path)
+    return dict(document_dict)
 
-        if isinstance(documents, str):
-            documents = [documents]
-            logger.info(
-                "Converted single string into a list of string. Assumed the string passed in is the document. Normally, chunk_document() should take a list as input."
-            )
-        elif not isinstance(documents, list):
-            raise TypeError(
-                "Expected: documents to be a list, but got {}".format(type(documents))
-            )
-
-        filepaths = leaf_node[0]["filepaths"]
-
-        doc_dict = cls._split_docs_by_filetype(documents, filepaths)
-        if len(doc_dict.keys()) > 1:
-            raise ValueError("Received multiple document types")
-        if len(doc_dict.keys()) < 1:
-            raise ValueError("Received no document types")
-
-        if FileTypes.MD in doc_dict:
-            doc_contents = [d for d, _ in doc_dict[FileTypes.MD]]
-            return TextSplitChunker(
-                doc_contents,
-                server_ctx_size,
-                chunk_word_count,
-                output_dir,
-            )
-
-        if FileTypes.PDF in doc_dict:
-            doc_paths = [p for _, p in doc_dict[FileTypes.PDF]]
-            return ContextAwareChunker(
-                doc_paths,
-                filepaths,
-                output_dir,
-                chunk_word_count,
-                tokenizer_model_name,
-                docling_model_path=docling_model_path,
-            )
 
-    @staticmethod
-    def _split_docs_by_filetype(
-        documents: List[str], filepaths: List[Path]
-    ) -> DefaultDict[FileTypes, List[Tuple[str, Path]]]:
-        """Separate documents into lists based on their filetype.
-
-        Currently, only Markdown and PDF are supported.
-        Args:
-            documents (List[str]): A list of the document contents as strings
-            filepaths (List[Path]): Corresponding document filepaths
-        Returns:
-            DefaultDict: Dictionary with either ".md" or ".pdf" as a key.
-                Markdown items contain document contents, PDF items contain
-                paths to documents.
-        """
-        doc_dict = defaultdict(list)
-        for doc, path in zip(documents, filepaths):
-            if path.suffix == ".md":
-                # append doc contents
-                doc_dict[FileTypes.MD].append((doc, path))
-            elif path.suffix == ".pdf":
-                # append doc paths
-                doc_dict[FileTypes.PDF].append((doc, path))
-            else:
-                raise ValueError(
-                    f"Received document of type .{path.suffix}, which is not a supported filetype"
-                )
-        return doc_dict
-
-
-class TextSplitChunker(ChunkerBase):
+class DocumentChunker:  # pylint: disable=too-many-instance-attributes
     def __init__(
         self,
-        document_contents: List | str,
-        server_ctx_size: int,
-        chunk_word_count: int,
+        document_paths: List[Path],
         output_dir: Path,
+        tokenizer_model_name: str | Path,
+        docling_model_path: Optional[Path] = None,
+        server_ctx_size: int = 4096,
+        chunk_word_count: int = 1024,
     ):
-        self.document_contents = document_contents
-        self.server_ctx_size = server_ctx_size
-        self.chunk_word_count = chunk_word_count
-        self.output_dir = output_dir
+        if not document_paths:
+            raise ValueError("Provided empty list of documents")
 
-    def chunk_documents(self) -> List:
-        """Naively chunk markdown documents based on the word count provided by the user.
-        Returns:
-            List[str]: List of chunked documents.
-        """
-        num_tokens_per_doc = _num_tokens_from_words(self.chunk_word_count)
-        if num_tokens_per_doc > int(self.server_ctx_size - 1024):
-            raise ValueError(
-                "Error: {}".format(
-                    str(
-                        f"Given word count ({self.chunk_word_count}) per doc will exceed the server context window size ({self.server_ctx_size})"
-                    )
-                )
-            )
-        if self.document_contents == []:
-            return []
+        document_dict = split_docs_by_filetype(document_paths)
 
-        chunk_size = _num_chars_from_tokens(num_tokens_per_doc)
-        return chunk_markdowns(self.document_contents, chunk_size)
+        if len(document_dict) > 1:
+            raise ValueError("Provided multiple document types")
 
+        # We know there is only 1 key, value pair, so we take the first
+        self.document_filetype, self.document_paths = next(iter(document_dict.items()))
+        self.docling_model_path = docling_model_path
+        self.converter = self._init_docling_converter()
 
-class ContextAwareChunker(ChunkerBase):  # pylint: disable=too-many-instance-attributes
-    def __init__(
-        self,
-        document_paths,
-        filepaths,
-        output_dir: Path,
-        chunk_word_count: int,
-        tokenizer_model_name: Optional[str],
-        docling_model_path=None,
-    ):
-        self.document_paths = document_paths
-        self.filepaths = filepaths
         self.output_dir = self._path_validator(output_dir)
+        self.server_ctx_size = server_ctx_size
         self.chunk_word_count = chunk_word_count
         self.tokenizer = self.create_tokenizer(tokenizer_model_name)
-        self.docling_model_path = docling_model_path
-
-    def chunk_documents(self) -> List:
-        """Semantically chunk PDF documents.
 
-        Returns:
-            List: a list of chunks from the documents
-        """
+    def _init_docling_converter(self):
+        """Initialize docling converter with filetype-specific configurations"""
         # triggers torch loading, import lazily
         # pylint: disable=import-outside-toplevel
         # Third Party
         from docling.document_converter import DocumentConverter, PdfFormatOption
         from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
 
-        if self.document_paths == []:
-            return []
-
         if self.docling_model_path is None:
             logger.info("Docling models not found on disk, downloading models...")
             self.docling_model_path = StandardPdfPipeline.download_models_hf()
@@ -258,17 +126,29 @@ def chunk_documents(self) -> List:
             artifacts_path=self.docling_model_path,
             do_ocr=False,
         )
+
         ocr_options = resolve_ocr_options()
         if ocr_options is not None:
             pipeline_options.do_ocr = True
             pipeline_options.ocr_options = ocr_options
 
-        converter = DocumentConverter(
+        return DocumentConverter(
             format_options={
                 InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
             }
         )
-        parsed_documents = converter.convert_all(self.filepaths)
+
+    def chunk_documents(self) -> List:
+        """Split a list of documents into chunks
+
+        Returns:
+            List: a list of chunks from the documents
+        """
+
+        if self.document_paths == []:
+            return []
+
+        parsed_documents = self.converter.convert_all(self.document_paths)
 
         docling_artifacts_path = self.export_documents(parsed_documents)
 
@@ -348,7 +228,7 @@ def fuse_texts(
         return fused_texts
 
     @staticmethod
-    def create_tokenizer(model_name: Optional[str]):
+    def create_tokenizer(model_path: str | Path):
         """
         Create a tokenizer instance from a pre-trained model or a local directory.
 
@@ -363,10 +243,8 @@ def create_tokenizer(model_name: Optional[str]):
         # Third Party
         from transformers import AutoTokenizer
 
-        if model_name is None:
-            raise TypeError("No model path provided")
-
-        model_path = Path(model_name)
+        if not isinstance(model_path, Path):
+            model_path = Path(model_path)
         error_info_message = (
             "Please run `ilab model download {download_args}` and try again"
         )
@@ -486,7 +364,7 @@ def get_table_page_number(self, json_book, idx):
                 prev_page_num = book_element["prov"][0]["page"]
                 break
         for book_element in json_book["main-text"][idx:]:
-            if "prov" in book_element:
+            if "prov" in book_element and book_element["prov"]:
                 next_page_num = book_element["prov"][0]["page"]
                 break
         if prev_page_num is not None and next_page_num is not None:
@@ -545,8 +423,14 @@ def build_chunks_from_docling_json(
                     current_book_page_number = self.get_table_page_number(
                         json_book, idx
                     )
+                    book_text = self.get_table(json_book, book_element["$ref"])
+                elif book_element["prov"]:
+                    current_book_page_number = book_element["prov"][0][
+                        "page"
+                    ]  # TODO export to function to handle empty ["prov"]
+                    book_text = book_element["text"]
                 else:
-                    current_book_page_number = book_element["prov"][0]["page"]
+                    current_book_page_number = None
                     book_text = book_element["text"]
 
             if book_element["type"] == "subtitle-level-1":
@@ -599,8 +483,6 @@ def build_chunks_from_docling_json(
 
             if book_element["type"] == "paragraph":
                 book_text = self.add_heading_formatting(book_text)
-            elif book_element["type"] == "table":
-                book_text = self.get_table(json_book, book_element["$ref"])
             if "## References" in book_text or "## Acknowledgements" in book_text:
                 # For research papers we ignore everything after this sections
                 break
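
For reference, a minimal usage sketch of the refactored API, based only on the signatures visible in this diff. The document paths, output directory, and tokenizer location below are hypothetical placeholders; an actual run also needs docling installed and a locally downloaded tokenizer model.

# Hypothetical usage sketch of the refactored DocumentChunker; not part of this
# commit. All paths below are placeholders, not values from the repository.
from pathlib import Path

from instructlab.sdg.utils.chunkers import DocumentChunker, split_docs_by_filetype

pdf_paths = [Path("docs/report-a.pdf"), Path("docs/report-b.pdf")]

# split_docs_by_filetype() groups paths by suffix and raises ValueError for
# anything outside SUPPORTED_FILETYPES ([".pdf", ".md"]).
grouped = split_docs_by_filetype(pdf_paths)  # {".pdf": [Path(...), Path(...)]}

# The chunker now takes document paths directly (no leaf_node / taxonomy_path)
# and always converts documents with docling. Mixing .md and .pdf in one call
# raises ValueError("Provided multiple document types"); if docling_model_path
# is None, the docling models are downloaded on first use.
chunker = DocumentChunker(
    document_paths=pdf_paths,
    output_dir=Path("/tmp/sdg-output"),
    tokenizer_model_name="/path/to/local/tokenizer-model",
    server_ctx_size=4096,
    chunk_word_count=1024,
)
chunks = chunker.chunk_documents()  # list of chunks from the documents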
