From 67909180190e1d12ce9c1ed411b60e584548729e Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Wed, 12 Feb 2025 15:23:54 -0500 Subject: [PATCH 01/33] Update PDF extraction and OCR options for hybrid chunking Signed-off-by: Aakanksha Duggal --- src/instructlab/sdg/utils/chunkers.py | 226 ++------------------------ src/instructlab/sdg/utils/taxonomy.py | 101 ++++++------ 2 files changed, 63 insertions(+), 264 deletions(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 52c15ba1..2dc3a48f 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -8,9 +8,11 @@ # Third Party from datasets import Dataset +from docling.chunking import HybridChunker from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( + AcceleratorOptions, EasyOcrOptions, OcrOptions, PdfPipelineOptions, @@ -57,7 +59,10 @@ def resolve_ocr_options() -> OcrOptions: # Third Party from docling.models.easyocr_model import EasyOcrModel - _ = EasyOcrModel(True, ocr_options) + accelerator_options = AcceleratorOptions() + _ = EasyOcrModel( + True, ocr_options, None, accelerator_options=accelerator_options + ) return ocr_options except ImportError: # no easyocr either, so don't use any OCR @@ -185,11 +190,11 @@ def _process_parsed_docling_json(self, json_fp: Path) -> Dataset: with open(json_fp, "r", encoding="utf-8") as f: data = json.load(f) - chunks = self.build_chunks_from_docling_json( - data, - max_token_per_chunk=500, - tokenizer=self.tokenizer, - ) + chunker = HybridChunker(tokenizer=self.tokenizer, max_token_per_chunk=500) + chunk_iter = chunker.chunk( + dl_doc=data + ) # Use hybrid chunker to chunk the document + chunks = [chunker.serialize_chunk(chunk) for chunk in chunk_iter] fused_texts = self.fuse_texts(chunks, 200) num_tokens_per_doc = _num_tokens_from_words(self.chunk_word_count) @@ -288,215 +293,6 @@ def get_token_count(self, text, tokenizer): """ return len(tokenizer.tokenize(text)) - def add_heading_formatting(self, text): - """ - Add heading formatting to the text if the first part is short. - Args: - text (str): The input text to format. - Returns: - str: Formatted text with headings applied. - """ - text = text.split(".") - - # Change this from hardcoded to something more flexible - if len(text) > 1 and len(text[0].split(" ")) < 3: - text = f"**{text[0]}**" + ".".join(text[1:]) - else: - text = ".".join(text) - return text - - def generate_table_from_parsed_rep(self, item): - """ - Generate the table from the parsed representation and return as a string. - Args: - item (dict): Parsed representation of a table. - Returns: - str: Formatted table as a string. - """ - caption = "" - if "text" in item: - caption = item["text"] - - data = item["data"] - - if len(data) <= 1 or len(data[0]) <= 1: - return "" - - table = [] - for _, row in enumerate(data): - trow = [] - for _, cell in enumerate(row): - trow.append(cell["text"]) - table.append(trow) - - table_text = tabulate(table, tablefmt="github") - if caption: - table_text += f"\nCaption: {caption}\n" - return table_text - - def get_table(self, json_book, table_ref): - """ - Retrieve a table from a document based on a reference string. - Args: - json_book (dict): JSON representation of the document. - table_ref (str): Reference path to the table within the document. - Returns: - str: Formatted table string. - """ - parts = table_ref.split("/") - table_text = self.generate_table_from_parsed_rep( - json_book[parts[1]][int(parts[2])] - ) - return table_text - - def get_table_page_number(self, json_book, idx): - """ - Get the page number of a table or other document element. - Args: - json_book (dict): JSON representation of the document. - idx (int): Index of the element in the document. - Returns: - int: Page number of the element. - """ - prev_page_num, next_page_num = None, None - for book_element in json_book["main-text"][idx - 1 :: -1]: - if "prov" in book_element: - prev_page_num = book_element["prov"][0]["page"] - break - for book_element in json_book["main-text"][idx:]: - if "prov" in book_element and book_element["prov"]: - next_page_num = book_element["prov"][0]["page"] - break - if prev_page_num is not None and next_page_num is not None: - if prev_page_num == next_page_num: - return prev_page_num - return next_page_num - if prev_page_num is not None: - return prev_page_num - if next_page_num is not None: - return next_page_num - - def build_chunks_from_docling_json( - self, - json_book, - max_token_per_chunk, - tokenizer, - keep_same_page_thing_together=False, - chunking_criteria=None, - ): - """ - Build document chunks from a docling JSON representation. - Args: - json_book (dict): JSON document to process. - max_token_per_chunk (int): Maximum token count per chunk. - tokenizer (AutoTokenizer): Tokenizer instance to use. - keep_same_page_thing_together (bool): Whether to keep content on the same page together. - chunking_criteria (callable): Custom function for determining chunk breaks. - Returns: - list: List of document chunks. - """ - current_buffer = [] - document_chunks = [] - prev_page_number = None - book_title = None - - for idx, book_element in enumerate(json_book["main-text"]): - if book_element["type"] in [ - "page-footer", - "picture", - "reference", - "meta-data", - "figure", - "page-header", - ]: - continue - if book_element["type"] == "footnote": - current_book_page_number = book_element["prov"][0]["page"] - elif book_element["type"] in [ - "subtitle-level-1", - "paragraph", - "table", - "title", - "equation", - ]: # 'page-header', - if book_element["type"] == "table": - current_book_page_number = self.get_table_page_number( - json_book, idx - ) - book_text = self.get_table(json_book, book_element["$ref"]) - elif book_element["prov"]: - current_book_page_number = book_element["prov"][0][ - "page" - ] # TODO export to function to handle empty ["prov"] - book_text = book_element["text"] - else: - current_book_page_number = None - book_text = book_element["text"] - - if book_element["type"] == "subtitle-level-1": - if book_title is None: - book_title = book_text - book_text = f"# Title: **{book_text}**" - else: - book_text = f"## **{book_text}**" - - if book_element["type"] == "title": - book_text = f"# **{book_text}**" - if book_element["type"] == "page-header": - book_text = f"Page Header: **{book_text}**\n\n" - - if chunking_criteria is not None: - # custom break function that can be used to chunk document - if chunking_criteria(book_text): - document_chunks.append("\n\n".join(current_buffer)) - current_buffer = [] - elif ( - prev_page_number is not None - and prev_page_number != current_book_page_number - ) and keep_same_page_thing_together: - document_chunks.append("\n\n".join(current_buffer)) - current_buffer = [] - else: - if ( - self.get_token_count("\n\n".join(current_buffer), tokenizer) - >= max_token_per_chunk - and len(current_buffer) > 1 - ): - chunk_text = "\n\n".join(current_buffer[:-1]) - logger.debug( - f"Current chunk size {self.get_token_count(chunk_text, tokenizer)} and max is {max_token_per_chunk}" - ) - - document_chunks.append("\n\n".join(current_buffer[:-1])) - - if ( - self.get_token_count(current_buffer[-1], tokenizer) - >= max_token_per_chunk - ): - logger.debug( - f"The following text was dropped from the document because it was too long to fit into a single context for synthetic data generation: {current_buffer[-1]}" - ) - document_chunks.append(current_buffer[-1]) - current_buffer = [] - else: - current_buffer = current_buffer[-1:] - - if book_element["type"] == "paragraph": - book_text = self.add_heading_formatting(book_text) - if "## References" in book_text or "## Acknowledgements" in book_text: - # For research papers we ignore everything after this sections - break - current_buffer.append(book_text) - - try: - prev_page_number = current_book_page_number - except Exception as e: # pylint: disable=broad-exception-caught - logger.error(f"Error processing book element: {book_element}, {str(e)}") - - if "\n\n".join(current_buffer) not in document_chunks: - document_chunks.append("\n\n".join(current_buffer)) - return document_chunks - def export_documents(self, converted_docs: Iterable[ConversionResult]): """Write converted documents to json files diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index ed0d7940..2aec5b7a 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -13,7 +13,7 @@ from datasets import Dataset # pylint: disable=no-name-in-module -from docling_parse.docling_parse import pdf_parser_v1 +from docling_parse.pdf_parsers import pdf_parser_v2 from instructlab.schema.taxonomy import DEFAULT_TAXONOMY_FOLDERS as TAXONOMY_FOLDERS from instructlab.schema.taxonomy import ( TaxonomyMessageFormat, @@ -28,7 +28,7 @@ from .chunkers import DocumentChunker # Initialize the pdf parser -PDFParser = pdf_parser_v1() +PDFParser = pdf_parser_v2("error") logger = logging.getLogger(__name__) @@ -122,13 +122,55 @@ def _string_contains_html(s: str) -> bool: return bool(html_tag_pattern.search(s)) +def extract_text_from_pdf(file_path: str) -> str: + """ + Extracts text from a PDF file using the docling PdfParser API. + + Args: + file_path (str): Path to the PDF file. + + Returns: + str: The extracted text from all pages. + """ + pdf_text = "" + # Load the PDF document in lazy mode. (Do not pass a doc_key here.) + pdf_doc = PDFParser.load(path_or_stream=file_path, lazy=True) + if pdf_doc is None: + logger.error(f"Failed to load PDF: {file_path}") + return "" + + num_pages = pdf_doc.number_of_pages() + logger.info(f"PDF '{file_path}' has {num_pages} pages.") + + # Note: The high-level API expects page numbers to be 1-indexed. + for page_no in range(1, num_pages + 1): + try: + pdf_page = pdf_doc.get_page(page_no=page_no) + text_lines = pdf_page.sanitized.export_to_textlines( + add_fontkey=True, add_fontname=False + ) + page_text = "\n".join(text_lines) + pdf_text += page_text + "\n" + except Exception as e: + logger.warning( + f"Error extracting text from page {page_no} of '{file_path}': {e}" + ) + continue + + # Unload the document to free memory + PDFParser.unload_document(file_path) + logger.info(f"Unloaded PDF document: {file_path}") + + return pdf_text + + def _get_documents( source: Dict[str, Union[str, List[str]]], skip_checkout: bool = False, document_output_dir: Path = None, ) -> Tuple[List[str], List[Path]]: """ - Retrieve the content of files (Markdown and PDF) from a Git repository. + Retrieve the content of files (Markdown and PDFs) from a Git repository. Args: source (dict): Source info containing repository URL, commit hash, and list of file patterns. @@ -186,55 +228,16 @@ def _get_documents( ) elif file_path.lower().endswith(".pdf"): - # Process PDF files using docling_parse's pdf_parser_v1 - doc_key = f"key_{os.path.basename(file_path)}" # Unique document key - logger.info(f"Loading PDF document from {file_path}") - - success = PDFParser.load_document(doc_key, file_path) - if not success: - logger.warning( - f"Failed to load PDF document: {file_path}" - ) - continue - - num_pages = PDFParser.number_of_pages(doc_key) - logger.info(f"PDF '{file_path}' has {num_pages} pages.") - - pdf_text = "" - - for page in range(num_pages): - try: - json_doc = PDFParser.parse_pdf_from_key_on_page( - doc_key, page - ) - if "pages" not in json_doc or not json_doc["pages"]: - logger.warning( - f"Page {page + 1} could not be parsed in '{file_path}'" - ) - continue - - json_page = json_doc["pages"][0] - - # Extract text from cells - for cell in json_page.get("cells", []): - text = cell.get("content", {}).get( - "rnormalized", "" - ) - if text.strip(): # Only append non-empty text - pdf_text += text.strip() + "\n" - except Exception as page_error: # pylint: disable=broad-exception-caught - logger.warning( - f"Error parsing page {page + 1} of '{file_path}': {page_error}" - ) - continue - + # Process PDF files using docling_parse's pdf_parser_v2 + logger.info(f"Extracting text from PDF file: {file_path}") + pdf_text = extract_text_from_pdf(file_path) if pdf_text: file_contents.append(pdf_text) filepaths.append(Path(file_path)) - - # Unload the document to free memory - PDFParser.unload_document(doc_key) - logger.info(f"Unloaded PDF document: {file_path}") + else: + logger.warning( + f"PDF file {file_path} could not be processed" + ) else: logger.info(f"Skipping unsupported file type: {file_path}") From 51fb86d04baee96412f377174430facd5f6e2693 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Wed, 12 Feb 2025 15:59:17 -0500 Subject: [PATCH 02/33] Update docling versions Signed-off-by: Aakanksha Duggal --- requirements.txt | 6 +++--- src/instructlab/sdg/utils/chunkers.py | 3 +-- src/instructlab/sdg/utils/taxonomy.py | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index ec3c012d..fbe526cf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 click>=8.1.7,<9.0.0 datasets>=2.18.0,<3.0.0 -docling[tesserocr]>=2.4.2,<=2.8.3; sys_platform != 'darwin' -docling>=2.4.2,<=2.8.3; sys_platform == 'darwin' -docling-parse>=2.0.0,<3.0.0 +docling[tesserocr]>=2.9.0; sys_platform != 'darwin' +docling>=2.9.0; sys_platform == 'darwin' +docling-parse>=3.3.0 GitPython>=3.1.42,<4.0.0 gguf>=0.6.0 httpx>=0.25.0,<1.0.0 diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 2dc3a48f..f3983713 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -19,7 +19,6 @@ TesseractOcrOptions, ) from langchain_text_splitters import Language, RecursiveCharacterTextSplitter -from tabulate import tabulate # First Party from instructlab.sdg.utils.model_formats import is_model_gguf, is_model_safetensors @@ -190,7 +189,7 @@ def _process_parsed_docling_json(self, json_fp: Path) -> Dataset: with open(json_fp, "r", encoding="utf-8") as f: data = json.load(f) - chunker = HybridChunker(tokenizer=self.tokenizer, max_token_per_chunk=500) + chunker = HybridChunker(tokenizer=self.tokenizer, max_tokens=500) chunk_iter = chunker.chunk( dl_doc=data ) # Use hybrid chunker to chunk the document diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index 2aec5b7a..b8bc8de6 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -151,7 +151,7 @@ def extract_text_from_pdf(file_path: str) -> str: ) page_text = "\n".join(text_lines) pdf_text += page_text + "\n" - except Exception as e: + except Exception as e: # pylint: disable=broad-exception-caught logger.warning( f"Error extracting text from page {page_no} of '{file_path}': {e}" ) From 90a7a4b6a2f4eaf7ca950db63b64a933cf0f3e52 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Wed, 12 Feb 2025 16:16:08 -0500 Subject: [PATCH 03/33] Update easyocr params Signed-off-by: Aakanksha Duggal --- src/instructlab/sdg/utils/chunkers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index f3983713..4774b710 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -60,7 +60,7 @@ def resolve_ocr_options() -> OcrOptions: accelerator_options = AcceleratorOptions() _ = EasyOcrModel( - True, ocr_options, None, accelerator_options=accelerator_options + True, None, ocr_options, accelerator_options=accelerator_options ) return ocr_options except ImportError: From 19ba945935f35e46c1c6f99c477bb9a6b3673cf9 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Wed, 12 Feb 2025 16:27:06 -0500 Subject: [PATCH 04/33] Add docling-core[chunking] to requirements Signed-off-by: Aakanksha Duggal --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index fbe526cf..85c25711 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 click>=8.1.7,<9.0.0 datasets>=2.18.0,<3.0.0 +docling-core[chunking]>=2.9.0 docling[tesserocr]>=2.9.0; sys_platform != 'darwin' docling>=2.9.0; sys_platform == 'darwin' docling-parse>=3.3.0 From def01eaeabd089bb35052820516a234813f6a261 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Thu, 13 Feb 2025 11:04:35 -0500 Subject: [PATCH 05/33] Update accelerator options for easy ocr Signed-off-by: Aakanksha Duggal --- src/instructlab/sdg/utils/chunkers.py | 44 +++++++++++++++++++-------- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 4774b710..bd57caf0 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -12,6 +12,7 @@ from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( + AcceleratorDevice, AcceleratorOptions, EasyOcrOptions, OcrOptions, @@ -36,7 +37,12 @@ def _num_chars_from_tokens(num_tokens) -> int: return int(num_tokens * 4) # 1 token ~ 4 English character -def resolve_ocr_options() -> OcrOptions: +def resolve_ocr_options( + docling_model_path: Optional[Path] = None, +) -> Optional[OcrOptions]: + # Declare ocr_options explicitly as Optional[OcrOptions] + ocr_options: Optional[OcrOptions] = None + # First, attempt to use tesserocr try: ocr_options = TesseractOcrOptions() @@ -48,19 +54,30 @@ def resolve_ocr_options() -> OcrOptions: return ocr_options except ImportError: # No tesserocr, so try something else - pass + logger.warning("Tesseract not found, falling back to EasyOCR.") + try: - ocr_options = EasyOcrOptions() + ocr_options = EasyOcrOptions( + lang=["en"], + use_gpu=False, + confidence_threshold=0.5, + model_storage_directory=str(docling_model_path), + recog_network="standard", + download_enabled=True, + ) # Keep easyocr models on the CPU instead of GPU ocr_options.use_gpu = False + accelerator_options = AcceleratorOptions(device="cpu") # triggers torch loading, import lazily # pylint: disable=import-outside-toplevel # Third Party from docling.models.easyocr_model import EasyOcrModel - accelerator_options = AcceleratorOptions() _ = EasyOcrModel( - True, None, ocr_options, accelerator_options=accelerator_options + enabled=True, + artifacts_path=None, + options=ocr_options, + accelerator_options=accelerator_options, ) return ocr_options except ImportError: @@ -131,7 +148,7 @@ def _init_docling_converter(self): do_ocr=False, ) - ocr_options = resolve_ocr_options() + ocr_options = resolve_ocr_options(docling_model_path=self.docling_model_path) if ocr_options is not None: pipeline_options.do_ocr = True pipeline_options.ocr_options = ocr_options @@ -190,10 +207,13 @@ def _process_parsed_docling_json(self, json_fp: Path) -> Dataset: data = json.load(f) chunker = HybridChunker(tokenizer=self.tokenizer, max_tokens=500) - chunk_iter = chunker.chunk( - dl_doc=data - ) # Use hybrid chunker to chunk the document - chunks = [chunker.serialize_chunk(chunk) for chunk in chunk_iter] + + try: + chunk_iter = chunker.chunk(dl_doc=data) + chunks = [chunker.serialize(chunk=chunk) for chunk in chunk_iter] + except Exception as e: + logger.error(f"Error chunking document {json_fp}: {e}") + fused_texts = self.fuse_texts(chunks, 200) num_tokens_per_doc = _num_tokens_from_words(self.chunk_word_count) @@ -317,11 +337,11 @@ def export_documents(self, converted_docs: Iterable[ConversionResult]): # Export Deep Search document JSON format: with (docling_artifacts_path / f"{doc_filename}.json").open("w") as fp: - fp.write(json.dumps(doc.legacy_document.export_to_dict())) + fp.write(json.dumps(doc.document.export_to_dict())) # Export Markdown format: with (docling_artifacts_path / f"{doc_filename}.md").open("w") as fp: - fp.write(doc.legacy_document.export_to_markdown()) + fp.write(doc.document.export_to_markdown()) else: logger.info(f"Document {doc.input.file} failed to convert.") failure_count += 1 From 635171cfb627642811e0bd2c149c077f5ff108a8 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Tue, 25 Feb 2025 06:05:03 -0500 Subject: [PATCH 06/33] Update the pdf doc parser unload function for v2 parser Signed-off-by: Aakanksha Duggal --- src/instructlab/sdg/utils/taxonomy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index b8bc8de6..73c424af 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -158,7 +158,7 @@ def extract_text_from_pdf(file_path: str) -> str: continue # Unload the document to free memory - PDFParser.unload_document(file_path) + pdf_doc.unload() logger.info(f"Unloaded PDF document: {file_path}") return pdf_text From 89bda7877441857d698cf7f370c5147ed82a8e7c Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Tue, 25 Feb 2025 06:16:53 -0500 Subject: [PATCH 07/33] Update exception and initialize chunks Signed-off-by: Aakanksha Duggal --- src/instructlab/sdg/utils/chunkers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index bd57caf0..9f3edb0f 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -12,7 +12,6 @@ from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( - AcceleratorDevice, AcceleratorOptions, EasyOcrOptions, OcrOptions, @@ -207,11 +206,12 @@ def _process_parsed_docling_json(self, json_fp: Path) -> Dataset: data = json.load(f) chunker = HybridChunker(tokenizer=self.tokenizer, max_tokens=500) + chunks = [] try: chunk_iter = chunker.chunk(dl_doc=data) chunks = [chunker.serialize(chunk=chunk) for chunk in chunk_iter] - except Exception as e: + except Exception as e: # pylint: disable=broad-exception-caught logger.error(f"Error chunking document {json_fp}: {e}") fused_texts = self.fuse_texts(chunks, 200) From 7b6f051b43cde2f57037b7b333748725214df7be Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Tue, 25 Feb 2025 07:39:15 -0500 Subject: [PATCH 08/33] Update chunk documents to avoid xporting to JSON and re-reading Signed-off-by: Aakanksha Duggal --- src/instructlab/sdg/utils/chunkers.py | 55 +++++++++------------------ 1 file changed, 18 insertions(+), 37 deletions(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 9f3edb0f..c716b791 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -165,19 +165,27 @@ def chunk_documents(self) -> List: List: a list of chunks from the documents """ - if self.document_paths == []: - return [] - parsed_documents = self.converter.convert_all(self.document_paths) + all_chunks = [] + for conversion_result in parsed_documents: + doc = conversion_result.document + chunker = HybridChunker(tokenizer=self.tokenizer, max_tokens=500) + try: + chunk_iter = chunker.chunk(dl_doc=doc) + chunks = [chunker.serialize(chunk=chunk) for chunk in chunk_iter] + except Exception as e: + logger.error( + f"Error chunking document {conversion_result.input.file}: {e}" + ) + chunks = [] - docling_artifacts_path = self.export_documents(parsed_documents) - - docling_json_paths = list(docling_artifacts_path.glob("*.json")) - chunks = [] - for json_fp in docling_json_paths: - chunks.extend(self._process_parsed_docling_json(json_fp)) + fused_texts = self.fuse_texts(chunks, 200) + num_tokens_per_doc = _num_tokens_from_words(self.chunk_word_count) + chunk_size = _num_chars_from_tokens(num_tokens_per_doc) + final_chunks = chunk_markdowns(fused_texts, chunk_size) + all_chunks.extend(final_chunks) - return chunks + return all_chunks def _path_validator(self, path) -> Path: """ @@ -193,33 +201,6 @@ def _path_validator(self, path) -> Path: raise FileNotFoundError(f"{path} does not exist.") return path - def _process_parsed_docling_json(self, json_fp: Path) -> Dataset: - """ - Process the parsed docling json file and return a dataset. - Args: - json_fp (Path): Path to the parsed docling json file. - Returns: - List: a list of chunks built from the provided json file - """ - logger.info(f"Processing parsed docling json file: {json_fp}") - with open(json_fp, "r", encoding="utf-8") as f: - data = json.load(f) - - chunker = HybridChunker(tokenizer=self.tokenizer, max_tokens=500) - chunks = [] - - try: - chunk_iter = chunker.chunk(dl_doc=data) - chunks = [chunker.serialize(chunk=chunk) for chunk in chunk_iter] - except Exception as e: # pylint: disable=broad-exception-caught - logger.error(f"Error chunking document {json_fp}: {e}") - - fused_texts = self.fuse_texts(chunks, 200) - - num_tokens_per_doc = _num_tokens_from_words(self.chunk_word_count) - chunk_size = _num_chars_from_tokens(num_tokens_per_doc) - return chunk_markdowns(fused_texts, chunk_size) - def fuse_texts( self, text_list: List, short_length_threshold: int = 130 ) -> List[str]: From f6c0eb7b6464235c4857aaceb860f40264764595 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Tue, 25 Feb 2025 08:06:54 -0500 Subject: [PATCH 09/33] Update tests for chunk documents, check for empty chunks Signed-off-by: Aakanksha Duggal --- tests/functional/test_chunkers.py | 37 ++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py index f06db504..d3b14fbe 100644 --- a/tests/functional/test_chunkers.py +++ b/tests/functional/test_chunkers.py @@ -32,10 +32,10 @@ def tokenizer_model_name(): @pytest.mark.parametrize( - "doc_type, expected_chunks, contains_text", + "doc_type, expected_chunks", [ - ("pdf", 9, "Phoenix is a minor constellation"), - ("md", 7, None), # Assuming there's no specific text to check in Markdown + ("pdf", 9), + ("md", 7), # Assuming there's no specific text to check in Markdown ], ) def test_chunk_documents( @@ -58,3 +58,34 @@ def test_chunk_documents( assert contains_text in chunks[0] for chunk in chunks: assert len(chunk) < 2500 + + +def test_chunk_documents( + tmp_path, tokenizer_model_name, test_paths, doc_type, expected_chunks +): + """ + Generalized test function for chunking documents. + Instead of checking for a particular text, we verify that: + - The number of chunks is greater than a minimum expected value. + - No chunk is empty. + - Each chunk is within a reasonable length. + """ + document_path = test_paths[doc_type] + chunker = DocumentChunker( + document_paths=[document_path], + output_dir=tmp_path, + tokenizer_model_name=tokenizer_model_name, + server_ctx_size=4096, + chunk_word_count=500, + ) + chunks = chunker.chunk_documents() + + # Check that we have more chunks than expected + assert ( + len(chunks) > expected_chunks + ), f"Expected more than {expected_chunks} chunks, got {len(chunks)}" + + # Check that no chunk is empty and each chunk is within the max allowed length (2500 characters) + for chunk in chunks: + assert chunk, "Chunk should not be empty" + assert len(chunk) < 2500, f"Chunk length {len(chunk)} exceeds maximum allowed" From d1b7e31d31e171106e2ab56df4ca84a779eea8e0 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Tue, 25 Feb 2025 11:18:22 -0500 Subject: [PATCH 10/33] Update import for chunking Signed-off-by: Aakanksha Duggal --- src/instructlab/sdg/utils/chunkers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index c716b791..a33f032b 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -8,7 +8,6 @@ # Third Party from datasets import Dataset -from docling.chunking import HybridChunker from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( @@ -18,6 +17,7 @@ PdfPipelineOptions, TesseractOcrOptions, ) +from docling_core.transforms.chunker.hybrid_chunker import HybridChunker from langchain_text_splitters import Language, RecursiveCharacterTextSplitter # First Party From 1f1cfd352bb55e76ae502d5c2d3e8fc72ac5c665 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Tue, 25 Feb 2025 11:46:46 -0500 Subject: [PATCH 11/33] Adding transformers and semchunk because module requires chunking extra Signed-off-by: Aakanksha Duggal --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 85c25711..ce24e735 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,8 +12,10 @@ instructlab-schema>=0.4.0 jinja2>=3.0.0 langchain-text-splitters openai>=1.13.3,<2.0.0 +semchunk >= 2.2.0 sentencepiece>=0.2.0 tabulate>=0.9.0 +transformers >=4.34.0 # Note: this dependency goes along with langchain-text-splitters and may be # removed once that one is removed. From f1477b78acbb57d189bb39788c9cf8e0e2bd3b8c Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Tue, 25 Feb 2025 11:57:07 -0500 Subject: [PATCH 12/33] Take transformers out of the block from leanimports Signed-off-by: Aakanksha Duggal --- tests/testdata/leanimports.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testdata/leanimports.py b/tests/testdata/leanimports.py index f6a50484..48a3bfe8 100644 --- a/tests/testdata/leanimports.py +++ b/tests/testdata/leanimports.py @@ -4,7 +4,7 @@ import sys # block slow imports -for unwanted in ["deepspeed", "llama_cpp", "torch", "transformers", "vllm"]: +for unwanted in ["deepspeed", "llama_cpp", "torch", "vllm"]: # importlib raises ModuleNotFound when sys.modules value is None. assert unwanted not in sys.modules sys.modules[unwanted] = None # type: ignore[assignment] From 5c28c5e2d2159505d1c6797072c0b478411a05cf Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Tue, 25 Feb 2025 12:11:16 -0500 Subject: [PATCH 13/33] Update test_chunkers to update functional tests Signed-off-by: Aakanksha Duggal --- tests/functional/test_chunkers.py | 41 ++++++++----------------------- 1 file changed, 10 insertions(+), 31 deletions(-) diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py index d3b14fbe..2924653b 100644 --- a/tests/functional/test_chunkers.py +++ b/tests/functional/test_chunkers.py @@ -25,52 +25,31 @@ def test_paths(): } -@pytest.fixture +@pytest.fixture(scope="module") def tokenizer_model_name(): """Fixture to return the path to the tokenizer model.""" return os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab") @pytest.mark.parametrize( - "doc_type, expected_chunks", + "document_type, expected_chunks", [ ("pdf", 9), - ("md", 7), # Assuming there's no specific text to check in Markdown + ("md", 7), ], ) def test_chunk_documents( - tmp_path, tokenizer_model_name, test_paths, doc_type, expected_chunks, contains_text + tmp_path, tokenizer_model_name, test_paths, document_type, expected_chunks ): """ Generalized test function for chunking documents. - """ - document_path = test_paths[doc_type] - chunker = DocumentChunker( - document_paths=[document_path], - output_dir=tmp_path, - tokenizer_model_name=tokenizer_model_name, - server_ctx_size=4096, - chunk_word_count=500, - ) - chunks = chunker.chunk_documents() - assert len(chunks) > expected_chunks - if contains_text: - assert contains_text in chunks[0] - for chunk in chunks: - assert len(chunk) < 2500 - -def test_chunk_documents( - tmp_path, tokenizer_model_name, test_paths, doc_type, expected_chunks -): - """ - Generalized test function for chunking documents. - Instead of checking for a particular text, we verify that: - - The number of chunks is greater than a minimum expected value. + Verifies that: + - The number of chunks is greater than the expected minimum. - No chunk is empty. - - Each chunk is within a reasonable length. + - Each chunk's length is less than 2500 characters. """ - document_path = test_paths[doc_type] + document_path = test_paths[document_type] chunker = DocumentChunker( document_paths=[document_path], output_dir=tmp_path, @@ -80,12 +59,12 @@ def test_chunk_documents( ) chunks = chunker.chunk_documents() - # Check that we have more chunks than expected + # Check that we have more chunks than expected. assert ( len(chunks) > expected_chunks ), f"Expected more than {expected_chunks} chunks, got {len(chunks)}" - # Check that no chunk is empty and each chunk is within the max allowed length (2500 characters) + # Check that no chunk is empty and each chunk's length is within the allowed limit. for chunk in chunks: assert chunk, "Chunk should not be empty" assert len(chunk) < 2500, f"Chunk length {len(chunk)} exceeds maximum allowed" From f24e852030105f61f6a8a718c891a5d1815b0cbc Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Tue, 25 Feb 2025 13:22:01 -0500 Subject: [PATCH 14/33] Update lean imports to not run into OOM error Signed-off-by: Aakanksha Duggal Co-authored-by: Courtney Pacheco --- src/instructlab/sdg/utils/chunkers.py | 2 +- tests/testdata/leanimports.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index a33f032b..22fb94d7 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -173,7 +173,7 @@ def chunk_documents(self) -> List: try: chunk_iter = chunker.chunk(dl_doc=doc) chunks = [chunker.serialize(chunk=chunk) for chunk in chunk_iter] - except Exception as e: + except Exception as e: # pylint: disable=broad-exception-caught logger.error( f"Error chunking document {conversion_result.input.file}: {e}" ) diff --git a/tests/testdata/leanimports.py b/tests/testdata/leanimports.py index 48a3bfe8..c8f0504f 100644 --- a/tests/testdata/leanimports.py +++ b/tests/testdata/leanimports.py @@ -9,6 +9,13 @@ assert unwanted not in sys.modules sys.modules[unwanted] = None # type: ignore[assignment] +# Try to import in your PR to see if this works around the issue. If not, print an error +try: + # Third Party + import docling_core +except ImportError as e: + print(f"Could not import `docling_core` because: {e}") + # First Party # This will trigger errors if any of the import chain tries to load # the unwanted modules From c9878c1693c373b8cc58f1bda49194122fb4c477 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Wed, 26 Feb 2025 02:14:11 -0500 Subject: [PATCH 15/33] Update src/instructlab/sdg/utils/chunkers.py Deactivate MPS acceleration on Github CI to avoid OOM error Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Signed-off-by: Aakanksha Duggal --- src/instructlab/sdg/utils/chunkers.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 22fb94d7..f5ac01dd 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -4,13 +4,16 @@ from typing import Dict, Iterable, List, Optional import json import logging +import os import re +import sys # Third Party from datasets import Dataset from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( + AcceleratorDevice, AcceleratorOptions, EasyOcrOptions, OcrOptions, @@ -146,7 +149,11 @@ def _init_docling_converter(self): artifacts_path=self.docling_model_path, do_ocr=False, ) - + # deactivate MPS acceleration on Github CI + if os.getenv("CI") and sys.platform == "darwin": + pipeline_options.accelerator_options = AcceleratorOptions( + device=AcceleratorDevice.CPU + ) ocr_options = resolve_ocr_options(docling_model_path=self.docling_model_path) if ocr_options is not None: pipeline_options.do_ocr = True From 30bd3b5fadbf17f4eef0c78be6773aa46a4e58b2 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Thu, 27 Feb 2025 09:58:40 -0500 Subject: [PATCH 16/33] Update use_gpu to None for easy ocr Signed-off-by: Aakanksha Duggal --- src/instructlab/sdg/utils/chunkers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index f5ac01dd..0b33410f 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -68,7 +68,7 @@ def resolve_ocr_options( download_enabled=True, ) # Keep easyocr models on the CPU instead of GPU - ocr_options.use_gpu = False + ocr_options.use_gpu = None accelerator_options = AcceleratorOptions(device="cpu") # triggers torch loading, import lazily # pylint: disable=import-outside-toplevel From 0376926caca2fb6db78b5de43b3fed035c7b67c2 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Thu, 27 Feb 2025 10:20:47 -0500 Subject: [PATCH 17/33] Increase from 1.7 to a larger value to avoid the PyTorch MPS backend running OOM Signed-off-by: Aakanksha Duggal Co-authored-by: Courtney Pacheco --- .github/workflows/test.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a59b6d47..e27eedfd 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -107,6 +107,9 @@ jobs: - name: Run unit and functional tests with tox run: | tox + env: + # Increase from 1.7 to a greater value to avoid the PyTorch MPS backend running OOM. + PYTORCH_MPS_HIGH_WATERMARK_RATIO: 2.0 - name: Remove llama-cpp-python from cache if: always() From 7709f0c4dcdce57720303ec9173bf943660fdca8 Mon Sep 17 00:00:00 2001 From: Aakanksha Duggal Date: Thu, 27 Feb 2025 19:56:51 -0500 Subject: [PATCH 18/33] Remove docling core test from lean imports, add transformers back and import semchunk and transformers in chunkers.py Signed-off-by: Aakanksha Duggal --- src/instructlab/sdg/utils/chunkers.py | 2 ++ tests/testdata/leanimports.py | 9 +-------- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 0b33410f..e879e4f0 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -22,6 +22,8 @@ ) from docling_core.transforms.chunker.hybrid_chunker import HybridChunker from langchain_text_splitters import Language, RecursiveCharacterTextSplitter +import semchunk +import transformers # First Party from instructlab.sdg.utils.model_formats import is_model_gguf, is_model_safetensors diff --git a/tests/testdata/leanimports.py b/tests/testdata/leanimports.py index c8f0504f..f6a50484 100644 --- a/tests/testdata/leanimports.py +++ b/tests/testdata/leanimports.py @@ -4,18 +4,11 @@ import sys # block slow imports -for unwanted in ["deepspeed", "llama_cpp", "torch", "vllm"]: +for unwanted in ["deepspeed", "llama_cpp", "torch", "transformers", "vllm"]: # importlib raises ModuleNotFound when sys.modules value is None. assert unwanted not in sys.modules sys.modules[unwanted] = None # type: ignore[assignment] -# Try to import in your PR to see if this works around the issue. If not, print an error -try: - # Third Party - import docling_core -except ImportError as e: - print(f"Could not import `docling_core` because: {e}") - # First Party # This will trigger errors if any of the import chain tries to load # the unwanted modules From ec99a89e539bfb2f7169a570f9dde83663839f5e Mon Sep 17 00:00:00 2001 From: Cloud User Date: Tue, 11 Mar 2025 07:19:27 +0000 Subject: [PATCH 19/33] fix: Move docling_core import inside method to avoid top-level transformers import Signed-off-by: Cloud User --- src/instructlab/sdg/utils/chunkers.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index e879e4f0..e7bdf820 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -20,10 +20,7 @@ PdfPipelineOptions, TesseractOcrOptions, ) -from docling_core.transforms.chunker.hybrid_chunker import HybridChunker from langchain_text_splitters import Language, RecursiveCharacterTextSplitter -import semchunk -import transformers # First Party from instructlab.sdg.utils.model_formats import is_model_gguf, is_model_safetensors @@ -63,14 +60,12 @@ def resolve_ocr_options( try: ocr_options = EasyOcrOptions( lang=["en"], - use_gpu=False, + use_gpu=None, confidence_threshold=0.5, model_storage_directory=str(docling_model_path), recog_network="standard", download_enabled=True, ) - # Keep easyocr models on the CPU instead of GPU - ocr_options.use_gpu = None accelerator_options = AcceleratorOptions(device="cpu") # triggers torch loading, import lazily # pylint: disable=import-outside-toplevel @@ -173,6 +168,10 @@ def chunk_documents(self) -> List: Returns: List: a list of chunks from the documents """ + # Move docling_core import inside method where it's used to avoid importing transformers at top level + # pylint: disable=import-outside-toplevel + # Third Party + from docling_core.transforms.chunker.hybrid_chunker import HybridChunker parsed_documents = self.converter.convert_all(self.document_paths) all_chunks = [] From 09e7b09ec98b6d2ea7da89763d005d13c50916b3 Mon Sep 17 00:00:00 2001 From: Cloud User Date: Tue, 11 Mar 2025 07:48:21 +0000 Subject: [PATCH 20/33] fix: Remove semchunk and transformers from requirements.txt Signed-off-by: Cloud User --- requirements.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index ce24e735..85c25711 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,10 +12,8 @@ instructlab-schema>=0.4.0 jinja2>=3.0.0 langchain-text-splitters openai>=1.13.3,<2.0.0 -semchunk >= 2.2.0 sentencepiece>=0.2.0 tabulate>=0.9.0 -transformers >=4.34.0 # Note: this dependency goes along with langchain-text-splitters and may be # removed once that one is removed. From bd3488b91e6fd1430dd4a80a2ca1b6c414f8ddcf Mon Sep 17 00:00:00 2001 From: Cloud User Date: Tue, 11 Mar 2025 08:09:33 +0000 Subject: [PATCH 21/33] ci: Update GitHub Actions workflow to use macos-latest-xlarge runners Signed-off-by: Cloud User --- .github/workflows/test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e27eedfd..928fb826 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -50,9 +50,9 @@ jobs: - "ubuntu-latest" include: - python: "3.11" - platform: "macos-latest" + platform: "macos-latest-xlarge" - python: "3.12" - platform: "macos-latest" + platform: "macos-latest-xlarge" steps: - name: "Harden Runner" uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 From 7c11b3ca7ab77d249d3724900fd92129e422acd3 Mon Sep 17 00:00:00 2001 From: Cloud User Date: Tue, 11 Mar 2025 08:20:19 +0000 Subject: [PATCH 22/33] ci: Update GitHub Actions workflow free disk space condition to reflect mac-os-latest-xlarge platform Signed-off-by: Cloud User --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 928fb826..50f868fb 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -66,7 +66,7 @@ jobs: fetch-depth: 0 - name: Free disk space - if: matrix.platform != 'macos-latest' + if: matrix.platform != 'macos-latest-xlarge' uses: ./.github/actions/free-disk-space - name: Install the expect package From 2cdb25e9f310cb1bf24d1e64f6745c5a7a478d2c Mon Sep 17 00:00:00 2001 From: Cloud User Date: Tue, 11 Mar 2025 14:49:43 +0000 Subject: [PATCH 23/33] test: Add fixture to force CPU usage on macOS CI environments Signed-off-by: Cloud User --- tests/functional/test_chunkers.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py index 2924653b..e3be1bce 100644 --- a/tests/functional/test_chunkers.py +++ b/tests/functional/test_chunkers.py @@ -1,9 +1,11 @@ # Standard from pathlib import Path import os +import sys # Third Party import pytest +import torch # First Party from instructlab.sdg.utils.chunkers import DocumentChunker @@ -31,6 +33,13 @@ def tokenizer_model_name(): return os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab") +@pytest.fixture(scope="module") +def force_cpu_on_macos_ci(): + """Force CPU usage on macOS CI environments.""" + if os.getenv("CI") and sys.platform == "darwin": + torch.backends.mps.enabled = False + + @pytest.mark.parametrize( "document_type, expected_chunks", [ @@ -39,7 +48,12 @@ def tokenizer_model_name(): ], ) def test_chunk_documents( - tmp_path, tokenizer_model_name, test_paths, document_type, expected_chunks + tmp_path, + tokenizer_model_name, + test_paths, + document_type, + expected_chunks, + force_cpu_on_macos_ci, ): """ Generalized test function for chunking documents. From 0a3eecd5d961c580f0cd2e79eeb8e8c558de2fff Mon Sep 17 00:00:00 2001 From: Cloud User Date: Tue, 11 Mar 2025 15:44:40 +0000 Subject: [PATCH 24/33] test: Add fixture to force CPU usage on macOS CI environments, disabling MPS Signed-off-by: Cloud User --- tests/functional/test_chunkers.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py index e3be1bce..8a620c26 100644 --- a/tests/functional/test_chunkers.py +++ b/tests/functional/test_chunkers.py @@ -33,12 +33,32 @@ def tokenizer_model_name(): return os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab") -@pytest.fixture(scope="module") +@pytest.fixture(scope="module", autouse=True) def force_cpu_on_macos_ci(): """Force CPU usage on macOS CI environments.""" - if os.getenv("CI") and sys.platform == "darwin": + is_ci = bool(os.getenv("CI")) # Convert to boolean explicitly + is_macos = sys.platform == "darwin" + print(f"::debug::CI environment: {is_ci}, Platform: {sys.platform}") + + if is_ci and is_macos: + print("::notice::Forcing CPU usage on macOS CI environment") + # Disable MPS torch.backends.mps.enabled = False + # Force CPU as default device + os.environ["PYTORCH_DEVICE"] = "cpu" + torch.set_default_device("cpu") + + # Ensure map_location is CPU for torch.load operations + def cpu_loader(storage, *args, **kwargs): + return storage.cpu() + + torch.load = lambda f, *a, **kw: torch._load_global( + f, map_location=cpu_loader, *a, **kw + ) + + yield + @pytest.mark.parametrize( "document_type, expected_chunks", @@ -53,7 +73,6 @@ def test_chunk_documents( test_paths, document_type, expected_chunks, - force_cpu_on_macos_ci, ): """ Generalized test function for chunking documents. From 755851c8ec4ec51b86da1f16be35c72459f9852b Mon Sep 17 00:00:00 2001 From: eshwarprasadS Date: Tue, 11 Mar 2025 17:44:27 +0000 Subject: [PATCH 25/33] test: test debug logging and MPS handling in macOS CI environment fixture Signed-off-by: eshwarprasadS --- tests/functional/test_chunkers.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py index 8a620c26..c3334c3e 100644 --- a/tests/functional/test_chunkers.py +++ b/tests/functional/test_chunkers.py @@ -2,6 +2,7 @@ from pathlib import Path import os import sys +import logging # Third Party import pytest @@ -10,6 +11,10 @@ # First Party from instructlab.sdg.utils.chunkers import DocumentChunker +# Set up logging +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger(__name__) + # Constants for Test Directory and Test Documents TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "..", "testdata") TEST_DOCUMENTS = { @@ -36,19 +41,20 @@ def tokenizer_model_name(): @pytest.fixture(scope="module", autouse=True) def force_cpu_on_macos_ci(): """Force CPU usage on macOS CI environments.""" + logger.debug("=== Starting CPU force fixture ===") is_ci = bool(os.getenv("CI")) # Convert to boolean explicitly is_macos = sys.platform == "darwin" - print(f"::debug::CI environment: {is_ci}, Platform: {sys.platform}") + logger.debug(f"CI environment: {is_ci}, Platform: {sys.platform}") if is_ci and is_macos: - print("::notice::Forcing CPU usage on macOS CI environment") + logger.info("Forcing CPU usage on macOS CI environment") # Disable MPS torch.backends.mps.enabled = False - + # Force CPU as default device os.environ["PYTORCH_DEVICE"] = "cpu" torch.set_default_device("cpu") - + # Ensure map_location is CPU for torch.load operations def cpu_loader(storage, *args, **kwargs): return storage.cpu() @@ -57,6 +63,15 @@ def cpu_loader(storage, *args, **kwargs): f, map_location=cpu_loader, *a, **kw ) + # Additional MPS disabling + os.environ["PYTORCH_MPS_ALLOCATOR_POLICY"] = "cpu" + if hasattr(torch.mps, 'empty_cache'): + torch.mps.empty_cache() + + logger.debug(f"After setup - MPS enabled: {torch.backends.mps.enabled}") + logger.debug(f"Current device: {os.getenv('PYTORCH_DEVICE', 'not set')}") + logger.debug("=== Finished CPU force fixture ===") + yield From 16f6b9f3fe19c45c56dbedf8def5e0f9c2467352 Mon Sep 17 00:00:00 2001 From: eshwarprasadS Date: Tue, 11 Mar 2025 18:00:10 +0000 Subject: [PATCH 26/33] test: Reorder imports and minor formatting in macOS CI fixture Signed-off-by: eshwarprasadS --- tests/functional/test_chunkers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py index c3334c3e..3eecbc5c 100644 --- a/tests/functional/test_chunkers.py +++ b/tests/functional/test_chunkers.py @@ -1,8 +1,8 @@ # Standard from pathlib import Path +import logging import os import sys -import logging # Third Party import pytest @@ -50,11 +50,11 @@ def force_cpu_on_macos_ci(): logger.info("Forcing CPU usage on macOS CI environment") # Disable MPS torch.backends.mps.enabled = False - + # Force CPU as default device os.environ["PYTORCH_DEVICE"] = "cpu" torch.set_default_device("cpu") - + # Ensure map_location is CPU for torch.load operations def cpu_loader(storage, *args, **kwargs): return storage.cpu() @@ -65,7 +65,7 @@ def cpu_loader(storage, *args, **kwargs): # Additional MPS disabling os.environ["PYTORCH_MPS_ALLOCATOR_POLICY"] = "cpu" - if hasattr(torch.mps, 'empty_cache'): + if hasattr(torch.mps, "empty_cache"): torch.mps.empty_cache() logger.debug(f"After setup - MPS enabled: {torch.backends.mps.enabled}") From bde48a35c42de2e4947dfdb5cdf6b907db4b48f2 Mon Sep 17 00:00:00 2001 From: eshwarprasadS Date: Tue, 11 Mar 2025 18:19:37 +0000 Subject: [PATCH 27/33] test: Simplify macOS device handling in chunkers and test fixture Signed-off-by: eshwarprasadS --- src/instructlab/sdg/utils/chunkers.py | 2 +- tests/functional/test_chunkers.py | 12 ++---------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index e7bdf820..796f3a0f 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -147,7 +147,7 @@ def _init_docling_converter(self): do_ocr=False, ) # deactivate MPS acceleration on Github CI - if os.getenv("CI") and sys.platform == "darwin": + if sys.platform == "darwin": pipeline_options.accelerator_options = AcceleratorOptions( device=AcceleratorDevice.CPU ) diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py index 3eecbc5c..241557a0 100644 --- a/tests/functional/test_chunkers.py +++ b/tests/functional/test_chunkers.py @@ -42,15 +42,9 @@ def tokenizer_model_name(): def force_cpu_on_macos_ci(): """Force CPU usage on macOS CI environments.""" logger.debug("=== Starting CPU force fixture ===") - is_ci = bool(os.getenv("CI")) # Convert to boolean explicitly is_macos = sys.platform == "darwin" - logger.debug(f"CI environment: {is_ci}, Platform: {sys.platform}") - - if is_ci and is_macos: + if is_macos: logger.info("Forcing CPU usage on macOS CI environment") - # Disable MPS - torch.backends.mps.enabled = False - # Force CPU as default device os.environ["PYTORCH_DEVICE"] = "cpu" torch.set_default_device("cpu") @@ -65,11 +59,9 @@ def cpu_loader(storage, *args, **kwargs): # Additional MPS disabling os.environ["PYTORCH_MPS_ALLOCATOR_POLICY"] = "cpu" - if hasattr(torch.mps, "empty_cache"): - torch.mps.empty_cache() - logger.debug(f"After setup - MPS enabled: {torch.backends.mps.enabled}") logger.debug(f"Current device: {os.getenv('PYTORCH_DEVICE', 'not set')}") + logger.debug(f"Available PyTorch devices: CPU={torch.cuda.is_available()}") logger.debug("=== Finished CPU force fixture ===") yield From cb5f4b134c14331332e51cebd72b317da24b3352 Mon Sep 17 00:00:00 2001 From: eshwarprasadS Date: Tue, 11 Mar 2025 18:37:28 +0000 Subject: [PATCH 28/33] refactor: Simplify MPS handling in macOS CI test fixture Signed-off-by: eshwarprasadS --- src/instructlab/sdg/utils/chunkers.py | 1 - tests/functional/test_chunkers.py | 12 ++---------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 796f3a0f..359a2bdf 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -4,7 +4,6 @@ from typing import Dict, Iterable, List, Optional import json import logging -import os import re import sys diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py index 241557a0..58e48cec 100644 --- a/tests/functional/test_chunkers.py +++ b/tests/functional/test_chunkers.py @@ -49,16 +49,8 @@ def force_cpu_on_macos_ci(): os.environ["PYTORCH_DEVICE"] = "cpu" torch.set_default_device("cpu") - # Ensure map_location is CPU for torch.load operations - def cpu_loader(storage, *args, **kwargs): - return storage.cpu() - - torch.load = lambda f, *a, **kw: torch._load_global( - f, map_location=cpu_loader, *a, **kw - ) - - # Additional MPS disabling - os.environ["PYTORCH_MPS_ALLOCATOR_POLICY"] = "cpu" + # Disable MPS + os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "0" logger.debug(f"Current device: {os.getenv('PYTORCH_DEVICE', 'not set')}") logger.debug(f"Available PyTorch devices: CPU={torch.cuda.is_available()}") From 259bdb583334c566c9611f74808f98992c3a6445 Mon Sep 17 00:00:00 2001 From: eshwarprasadS Date: Tue, 11 Mar 2025 19:26:23 +0000 Subject: [PATCH 29/33] fix: add CI env check condition to disabling MPS Signed-off-by: eshwarprasadS --- src/instructlab/sdg/utils/chunkers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 359a2bdf..e7bdf820 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -4,6 +4,7 @@ from typing import Dict, Iterable, List, Optional import json import logging +import os import re import sys @@ -146,7 +147,7 @@ def _init_docling_converter(self): do_ocr=False, ) # deactivate MPS acceleration on Github CI - if sys.platform == "darwin": + if os.getenv("CI") and sys.platform == "darwin": pipeline_options.accelerator_options = AcceleratorOptions( device=AcceleratorDevice.CPU ) From 0d5a749c6ce50ae62d79d1532e736831d47d11b8 Mon Sep 17 00:00:00 2001 From: eshwarprasadS Date: Tue, 11 Mar 2025 19:36:43 +0000 Subject: [PATCH 30/33] Revert latest commit to fix broken test Signed-off-by: eshwarprasadS --- src/instructlab/sdg/utils/chunkers.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index e7bdf820..359a2bdf 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -4,7 +4,6 @@ from typing import Dict, Iterable, List, Optional import json import logging -import os import re import sys @@ -147,7 +146,7 @@ def _init_docling_converter(self): do_ocr=False, ) # deactivate MPS acceleration on Github CI - if os.getenv("CI") and sys.platform == "darwin": + if sys.platform == "darwin": pipeline_options.accelerator_options = AcceleratorOptions( device=AcceleratorDevice.CPU ) From 728443e64a04b0f3e2cdd9a0b993419c546bf946 Mon Sep 17 00:00:00 2001 From: eshwarprasadS Date: Thu, 13 Mar 2025 17:37:38 +0000 Subject: [PATCH 31/33] ci: update test workflow to use macos-latest platform Signed-off-by: eshwarprasadS --- .github/workflows/test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 50f868fb..e27eedfd 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -50,9 +50,9 @@ jobs: - "ubuntu-latest" include: - python: "3.11" - platform: "macos-latest-xlarge" + platform: "macos-latest" - python: "3.12" - platform: "macos-latest-xlarge" + platform: "macos-latest" steps: - name: "Harden Runner" uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0 @@ -66,7 +66,7 @@ jobs: fetch-depth: 0 - name: Free disk space - if: matrix.platform != 'macos-latest-xlarge' + if: matrix.platform != 'macos-latest' uses: ./.github/actions/free-disk-space - name: Install the expect package From a33749b2d8ed0e3032d81e4af506a92c975d26c2 Mon Sep 17 00:00:00 2001 From: eshwarprasadS Date: Thu, 13 Mar 2025 18:14:53 +0000 Subject: [PATCH 32/33] ci: add CI environment variable to enable macOS MPS handling - Updated tox.ini to pass CI environment variable. - Modified DocumentChunker to check for CI environment before disabling MPS on macOS. Signed-off-by: eshwarprasadS --- src/instructlab/sdg/utils/chunkers.py | 3 ++- tox.ini | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 359a2bdf..e7bdf820 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -4,6 +4,7 @@ from typing import Dict, Iterable, List, Optional import json import logging +import os import re import sys @@ -146,7 +147,7 @@ def _init_docling_converter(self): do_ocr=False, ) # deactivate MPS acceleration on Github CI - if sys.platform == "darwin": + if os.getenv("CI") and sys.platform == "darwin": pipeline_options.accelerator_options = AcceleratorOptions( device=AcceleratorDevice.CPU ) diff --git a/tox.ini b/tox.ini index cf912fd8..f3f9d829 100644 --- a/tox.ini +++ b/tox.ini @@ -12,6 +12,8 @@ description = run tests (unit, unitcov, functional) # are huge. This reduces venv from 5.7 GB to 1.5 GB. setenv = PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu +passenv = + CI package = wheel wheel_build_env = pkg deps = -r requirements-dev.txt From a8273f4007ff6cfb7ee372d89ae9c8153060c5fe Mon Sep 17 00:00:00 2001 From: eshwarprasadS Date: Thu, 13 Mar 2025 21:44:51 +0000 Subject: [PATCH 33/33] refactor: Remove PDF extraction using docling parse, remove all references to docling parse Signed-off-by: eshwarprasadS --- requirements.txt | 1 - src/instructlab/sdg/utils/taxonomy.py | 104 ++++---------------------- 2 files changed, 15 insertions(+), 90 deletions(-) diff --git a/requirements.txt b/requirements.txt index 85c25711..356b7b7e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,6 @@ datasets>=2.18.0,<3.0.0 docling-core[chunking]>=2.9.0 docling[tesserocr]>=2.9.0; sys_platform != 'darwin' docling>=2.9.0; sys_platform == 'darwin' -docling-parse>=3.3.0 GitPython>=3.1.42,<4.0.0 gguf>=0.6.0 httpx>=0.25.0,<1.0.0 diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index 73c424af..c6a9799c 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -13,7 +13,6 @@ from datasets import Dataset # pylint: disable=no-name-in-module -from docling_parse.pdf_parsers import pdf_parser_v2 from instructlab.schema.taxonomy import DEFAULT_TAXONOMY_FOLDERS as TAXONOMY_FOLDERS from instructlab.schema.taxonomy import ( TaxonomyMessageFormat, @@ -27,9 +26,6 @@ # Local from .chunkers import DocumentChunker -# Initialize the pdf parser -PDFParser = pdf_parser_v2("error") - logger = logging.getLogger(__name__) @@ -122,55 +118,13 @@ def _string_contains_html(s: str) -> bool: return bool(html_tag_pattern.search(s)) -def extract_text_from_pdf(file_path: str) -> str: - """ - Extracts text from a PDF file using the docling PdfParser API. - - Args: - file_path (str): Path to the PDF file. - - Returns: - str: The extracted text from all pages. - """ - pdf_text = "" - # Load the PDF document in lazy mode. (Do not pass a doc_key here.) - pdf_doc = PDFParser.load(path_or_stream=file_path, lazy=True) - if pdf_doc is None: - logger.error(f"Failed to load PDF: {file_path}") - return "" - - num_pages = pdf_doc.number_of_pages() - logger.info(f"PDF '{file_path}' has {num_pages} pages.") - - # Note: The high-level API expects page numbers to be 1-indexed. - for page_no in range(1, num_pages + 1): - try: - pdf_page = pdf_doc.get_page(page_no=page_no) - text_lines = pdf_page.sanitized.export_to_textlines( - add_fontkey=True, add_fontname=False - ) - page_text = "\n".join(text_lines) - pdf_text += page_text + "\n" - except Exception as e: # pylint: disable=broad-exception-caught - logger.warning( - f"Error extracting text from page {page_no} of '{file_path}': {e}" - ) - continue - - # Unload the document to free memory - pdf_doc.unload() - logger.info(f"Unloaded PDF document: {file_path}") - - return pdf_text - - def _get_documents( source: Dict[str, Union[str, List[str]]], skip_checkout: bool = False, document_output_dir: Path = None, -) -> Tuple[List[str], List[Path]]: +) -> Tuple[List[Path], List[Path]]: """ - Retrieve the content of files (Markdown and PDFs) from a Git repository. + Retrieve file paths (Markdown and PDFs) from a Git repository. Args: source (dict): Source info containing repository URL, commit hash, and list of file patterns. @@ -178,8 +132,8 @@ def _get_documents( document_output_dir (Path, optional): Directory to clone the repository into. Defaults to current directory. Returns: - Tuple[List[str], List[Path]]: - - List of document contents (Markdown as text and PDFs as extracted text). + Tuple[List[Path], List[Path]]: + - List of file paths for valid documents (Markdown and PDFs). - List of corresponding file paths. Raises: @@ -196,12 +150,10 @@ def _get_documents( if not skip_checkout and commit_hash: repo.git.checkout(commit_hash) - file_contents = [] filepaths = [] logger.info("Processing files...") for pattern in file_patterns: - # Use glob to find files matching the pattern matched_files = glob.glob( os.path.join(repo.working_dir, pattern), recursive=True ) @@ -211,37 +163,13 @@ def _get_documents( if os.path.isfile(file_path): logger.info(f"Processing file: {file_path}") try: - if file_path.lower().endswith(".md"): - # Process Markdown files - with open(file_path, "r", encoding="utf-8") as file: - content = file.read() - if _string_contains_html(content): - logging.warning( - f"Provided markdown file {file_path} contains HTML contents, which is currently unsupported as a part of markdown" - "NOTE: Continuing this might affect your data generation quality." - "To get best results please format your markdown documents without the use of HTML or use a different document filetype." - ) - file_contents.append(content) - filepaths.append(Path(file_path)) - logger.info( - f"Appended Markdown content from {file_path}" - ) - - elif file_path.lower().endswith(".pdf"): - # Process PDF files using docling_parse's pdf_parser_v2 - logger.info(f"Extracting text from PDF file: {file_path}") - pdf_text = extract_text_from_pdf(file_path) - if pdf_text: - file_contents.append(pdf_text) - filepaths.append(Path(file_path)) - else: - logger.warning( - f"PDF file {file_path} could not be processed" - ) - + if file_path.lower().endswith((".md", ".pdf")): + filepaths.append(Path(file_path)) + logger.info(f"Added file path: {file_path}") else: logger.info(f"Skipping unsupported file type: {file_path}") - except Exception as file_error: # pylint: disable=broad-exception-caught + # pylint: disable=broad-exception-caught + except Exception as file_error: logger.error( f"Error processing file '{file_path}': {file_error}" ) @@ -249,8 +177,8 @@ def _get_documents( else: logger.info(f"Skipping non-file path: {file_path}") - if file_contents: - return file_contents, filepaths + if filepaths: + return filepaths, filepaths raise SystemExit("Couldn't find knowledge documents") except (OSError, git.exc.GitCommandError, FileNotFoundError) as e: @@ -284,17 +212,17 @@ def _read_taxonomy_file( task_description = contents.get("task_description", None) domain = contents.get("domain") documents = contents.get("document") - document_contents, doc_filepaths = None, None + doc_filepaths, _ = None, None if documents: os.makedirs(document_output_dir, exist_ok=True) unique_output_dir = mkdtemp( prefix=f"{leaf_node_path}_", dir=document_output_dir ) - document_contents, doc_filepaths = _get_documents( + doc_filepaths, _ = _get_documents( source=documents, document_output_dir=unique_output_dir, ) - logger.debug("Content from git repo fetched") + logger.debug("File paths from git repo fetched") for seed_example in contents.get("seed_examples"): context = seed_example.get("context", "") @@ -305,7 +233,6 @@ def _read_taxonomy_file( "questions_and_answers": question_answer_list, "context": context, "taxonomy_path": tax_path, - "documents": document_contents, "filepaths": doc_filepaths, "domain": domain, "document_outline": contents.get("document_outline"), @@ -322,7 +249,6 @@ def _read_taxonomy_file( "output": answer, "taxonomy_path": tax_path, "task_description": task_description, - "document": documents, "domain": domain, } ) @@ -496,7 +422,7 @@ def leaf_node_to_samples( docling_model_path=None, ): samples = [] - if leaf_node and leaf_node[0].get("documents"): + if leaf_node and leaf_node[0].get("filepaths"): samples = _knowledge_leaf_node_to_samples( leaf_node, server_ctx_size,