From 67909180190e1d12ce9c1ed411b60e584548729e Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Wed, 12 Feb 2025 15:23:54 -0500
Subject: [PATCH 01/33] Update PDF extraction and OCR options for hybrid
 chunking

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
---
 src/instructlab/sdg/utils/chunkers.py | 226 ++------------------------
 src/instructlab/sdg/utils/taxonomy.py | 101 ++++++------
 2 files changed, 63 insertions(+), 264 deletions(-)

diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index 52c15ba1..2dc3a48f 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -8,9 +8,11 @@
 
 # Third Party
 from datasets import Dataset
+from docling.chunking import HybridChunker
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
+    AcceleratorOptions,
     EasyOcrOptions,
     OcrOptions,
     PdfPipelineOptions,
@@ -57,7 +59,10 @@ def resolve_ocr_options() -> OcrOptions:
         # Third Party
         from docling.models.easyocr_model import EasyOcrModel
 
-        _ = EasyOcrModel(True, ocr_options)
+        accelerator_options = AcceleratorOptions()
+        _ = EasyOcrModel(
+            True, ocr_options, None, accelerator_options=accelerator_options
+        )
         return ocr_options
     except ImportError:
         # no easyocr either, so don't use any OCR
@@ -185,11 +190,11 @@ def _process_parsed_docling_json(self, json_fp: Path) -> Dataset:
         with open(json_fp, "r", encoding="utf-8") as f:
             data = json.load(f)
 
-        chunks = self.build_chunks_from_docling_json(
-            data,
-            max_token_per_chunk=500,
-            tokenizer=self.tokenizer,
-        )
+        chunker = HybridChunker(tokenizer=self.tokenizer, max_token_per_chunk=500)
+        chunk_iter = chunker.chunk(
+            dl_doc=data
+        )  # Use hybrid chunker to chunk the document
+        chunks = [chunker.serialize_chunk(chunk) for chunk in chunk_iter]
         fused_texts = self.fuse_texts(chunks, 200)
 
         num_tokens_per_doc = _num_tokens_from_words(self.chunk_word_count)
@@ -288,215 +293,6 @@ def get_token_count(self, text, tokenizer):
         """
         return len(tokenizer.tokenize(text))
 
-    def add_heading_formatting(self, text):
-        """
-        Add heading formatting to the text if the first part is short.
-        Args:
-            text (str): The input text to format.
-        Returns:
-            str: Formatted text with headings applied.
-        """
-        text = text.split(".")
-
-        # Change this from hardcoded to something more flexible
-        if len(text) > 1 and len(text[0].split(" ")) < 3:
-            text = f"**{text[0]}**" + ".".join(text[1:])
-        else:
-            text = ".".join(text)
-        return text
-
-    def generate_table_from_parsed_rep(self, item):
-        """
-        Generate the table from the parsed representation and return as a string.
-        Args:
-            item (dict): Parsed representation of a table.
-        Returns:
-            str: Formatted table as a string.
-        """
-        caption = ""
-        if "text" in item:
-            caption = item["text"]
-
-        data = item["data"]
-
-        if len(data) <= 1 or len(data[0]) <= 1:
-            return ""
-
-        table = []
-        for _, row in enumerate(data):
-            trow = []
-            for _, cell in enumerate(row):
-                trow.append(cell["text"])
-            table.append(trow)
-
-        table_text = tabulate(table, tablefmt="github")
-        if caption:
-            table_text += f"\nCaption: {caption}\n"
-        return table_text
-
-    def get_table(self, json_book, table_ref):
-        """
-        Retrieve a table from a document based on a reference string.
-        Args:
-            json_book (dict): JSON representation of the document.
-            table_ref (str): Reference path to the table within the document.
-        Returns:
-            str: Formatted table string.
-        """
-        parts = table_ref.split("/")
-        table_text = self.generate_table_from_parsed_rep(
-            json_book[parts[1]][int(parts[2])]
-        )
-        return table_text
-
-    def get_table_page_number(self, json_book, idx):
-        """
-        Get the page number of a table or other document element.
-        Args:
-            json_book (dict): JSON representation of the document.
-            idx (int): Index of the element in the document.
-        Returns:
-            int: Page number of the element.
-        """
-        prev_page_num, next_page_num = None, None
-        for book_element in json_book["main-text"][idx - 1 :: -1]:
-            if "prov" in book_element:
-                prev_page_num = book_element["prov"][0]["page"]
-                break
-        for book_element in json_book["main-text"][idx:]:
-            if "prov" in book_element and book_element["prov"]:
-                next_page_num = book_element["prov"][0]["page"]
-                break
-        if prev_page_num is not None and next_page_num is not None:
-            if prev_page_num == next_page_num:
-                return prev_page_num
-            return next_page_num
-        if prev_page_num is not None:
-            return prev_page_num
-        if next_page_num is not None:
-            return next_page_num
-
-    def build_chunks_from_docling_json(
-        self,
-        json_book,
-        max_token_per_chunk,
-        tokenizer,
-        keep_same_page_thing_together=False,
-        chunking_criteria=None,
-    ):
-        """
-        Build document chunks from a docling JSON representation.
-        Args:
-            json_book (dict): JSON document to process.
-            max_token_per_chunk (int): Maximum token count per chunk.
-            tokenizer (AutoTokenizer): Tokenizer instance to use.
-            keep_same_page_thing_together (bool): Whether to keep content on the same page together.
-            chunking_criteria (callable): Custom function for determining chunk breaks.
-        Returns:
-            list: List of document chunks.
-        """
-        current_buffer = []
-        document_chunks = []
-        prev_page_number = None
-        book_title = None
-
-        for idx, book_element in enumerate(json_book["main-text"]):
-            if book_element["type"] in [
-                "page-footer",
-                "picture",
-                "reference",
-                "meta-data",
-                "figure",
-                "page-header",
-            ]:
-                continue
-            if book_element["type"] == "footnote":
-                current_book_page_number = book_element["prov"][0]["page"]
-            elif book_element["type"] in [
-                "subtitle-level-1",
-                "paragraph",
-                "table",
-                "title",
-                "equation",
-            ]:  # 'page-header',
-                if book_element["type"] == "table":
-                    current_book_page_number = self.get_table_page_number(
-                        json_book, idx
-                    )
-                    book_text = self.get_table(json_book, book_element["$ref"])
-                elif book_element["prov"]:
-                    current_book_page_number = book_element["prov"][0][
-                        "page"
-                    ]  # TODO export to function to handle empty ["prov"]
-                    book_text = book_element["text"]
-                else:
-                    current_book_page_number = None
-                    book_text = book_element["text"]
-
-                if book_element["type"] == "subtitle-level-1":
-                    if book_title is None:
-                        book_title = book_text
-                        book_text = f"# Title: **{book_text}**"
-                    else:
-                        book_text = f"## **{book_text}**"
-
-                if book_element["type"] == "title":
-                    book_text = f"# **{book_text}**"
-                if book_element["type"] == "page-header":
-                    book_text = f"Page Header: **{book_text}**\n\n"
-
-                if chunking_criteria is not None:
-                    # custom break function that can be used to chunk document
-                    if chunking_criteria(book_text):
-                        document_chunks.append("\n\n".join(current_buffer))
-                        current_buffer = []
-                elif (
-                    prev_page_number is not None
-                    and prev_page_number != current_book_page_number
-                ) and keep_same_page_thing_together:
-                    document_chunks.append("\n\n".join(current_buffer))
-                    current_buffer = []
-                else:
-                    if (
-                        self.get_token_count("\n\n".join(current_buffer), tokenizer)
-                        >= max_token_per_chunk
-                        and len(current_buffer) > 1
-                    ):
-                        chunk_text = "\n\n".join(current_buffer[:-1])
-                        logger.debug(
-                            f"Current chunk size {self.get_token_count(chunk_text, tokenizer)} and max is {max_token_per_chunk}"
-                        )
-
-                        document_chunks.append("\n\n".join(current_buffer[:-1]))
-
-                        if (
-                            self.get_token_count(current_buffer[-1], tokenizer)
-                            >= max_token_per_chunk
-                        ):
-                            logger.debug(
-                                f"The following text was dropped from the document because it was too long to fit into a single context for synthetic data generation: {current_buffer[-1]}"
-                            )
-                            document_chunks.append(current_buffer[-1])
-                            current_buffer = []
-                        else:
-                            current_buffer = current_buffer[-1:]
-
-                if book_element["type"] == "paragraph":
-                    book_text = self.add_heading_formatting(book_text)
-                if "## References" in book_text or "## Acknowledgements" in book_text:
-                    # For research papers we ignore everything after this sections
-                    break
-                current_buffer.append(book_text)
-
-            try:
-                prev_page_number = current_book_page_number
-            except Exception as e:  # pylint: disable=broad-exception-caught
-                logger.error(f"Error processing book element: {book_element}, {str(e)}")
-
-        if "\n\n".join(current_buffer) not in document_chunks:
-            document_chunks.append("\n\n".join(current_buffer))
-        return document_chunks
-
     def export_documents(self, converted_docs: Iterable[ConversionResult]):
         """Write converted documents to json files
 
diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py
index ed0d7940..2aec5b7a 100644
--- a/src/instructlab/sdg/utils/taxonomy.py
+++ b/src/instructlab/sdg/utils/taxonomy.py
@@ -13,7 +13,7 @@
 from datasets import Dataset
 
 # pylint: disable=no-name-in-module
-from docling_parse.docling_parse import pdf_parser_v1
+from docling_parse.pdf_parsers import pdf_parser_v2
 from instructlab.schema.taxonomy import DEFAULT_TAXONOMY_FOLDERS as TAXONOMY_FOLDERS
 from instructlab.schema.taxonomy import (
     TaxonomyMessageFormat,
@@ -28,7 +28,7 @@
 from .chunkers import DocumentChunker
 
 # Initialize the pdf parser
-PDFParser = pdf_parser_v1()
+PDFParser = pdf_parser_v2("error")
 
 logger = logging.getLogger(__name__)
 
@@ -122,13 +122,55 @@ def _string_contains_html(s: str) -> bool:
     return bool(html_tag_pattern.search(s))
 
 
+def extract_text_from_pdf(file_path: str) -> str:
+    """
+    Extracts text from a PDF file using the docling PdfParser API.
+
+    Args:
+        file_path (str): Path to the PDF file.
+
+    Returns:
+        str: The extracted text from all pages.
+    """
+    pdf_text = ""
+    # Load the PDF document in lazy mode. (Do not pass a doc_key here.)
+    pdf_doc = PDFParser.load(path_or_stream=file_path, lazy=True)
+    if pdf_doc is None:
+        logger.error(f"Failed to load PDF: {file_path}")
+        return ""
+
+    num_pages = pdf_doc.number_of_pages()
+    logger.info(f"PDF '{file_path}' has {num_pages} pages.")
+
+    # Note: The high-level API expects page numbers to be 1-indexed.
+    for page_no in range(1, num_pages + 1):
+        try:
+            pdf_page = pdf_doc.get_page(page_no=page_no)
+            text_lines = pdf_page.sanitized.export_to_textlines(
+                add_fontkey=True, add_fontname=False
+            )
+            page_text = "\n".join(text_lines)
+            pdf_text += page_text + "\n"
+        except Exception as e:
+            logger.warning(
+                f"Error extracting text from page {page_no} of '{file_path}': {e}"
+            )
+            continue
+
+    # Unload the document to free memory
+    PDFParser.unload_document(file_path)
+    logger.info(f"Unloaded PDF document: {file_path}")
+
+    return pdf_text
+
+
 def _get_documents(
     source: Dict[str, Union[str, List[str]]],
     skip_checkout: bool = False,
     document_output_dir: Path = None,
 ) -> Tuple[List[str], List[Path]]:
     """
-    Retrieve the content of files (Markdown and PDF) from a Git repository.
+    Retrieve the content of files (Markdown and PDFs) from a Git repository.
 
     Args:
         source (dict): Source info containing repository URL, commit hash, and list of file patterns.
@@ -186,55 +228,16 @@ def _get_documents(
                                 )
 
                         elif file_path.lower().endswith(".pdf"):
-                            # Process PDF files using docling_parse's pdf_parser_v1
-                            doc_key = f"key_{os.path.basename(file_path)}"  # Unique document key
-                            logger.info(f"Loading PDF document from {file_path}")
-
-                            success = PDFParser.load_document(doc_key, file_path)
-                            if not success:
-                                logger.warning(
-                                    f"Failed to load PDF document: {file_path}"
-                                )
-                                continue
-
-                            num_pages = PDFParser.number_of_pages(doc_key)
-                            logger.info(f"PDF '{file_path}' has {num_pages} pages.")
-
-                            pdf_text = ""
-
-                            for page in range(num_pages):
-                                try:
-                                    json_doc = PDFParser.parse_pdf_from_key_on_page(
-                                        doc_key, page
-                                    )
-                                    if "pages" not in json_doc or not json_doc["pages"]:
-                                        logger.warning(
-                                            f"Page {page + 1} could not be parsed in '{file_path}'"
-                                        )
-                                        continue
-
-                                    json_page = json_doc["pages"][0]
-
-                                    # Extract text from cells
-                                    for cell in json_page.get("cells", []):
-                                        text = cell.get("content", {}).get(
-                                            "rnormalized", ""
-                                        )
-                                        if text.strip():  # Only append non-empty text
-                                            pdf_text += text.strip() + "\n"
-                                except Exception as page_error:  # pylint: disable=broad-exception-caught
-                                    logger.warning(
-                                        f"Error parsing page {page + 1} of '{file_path}': {page_error}"
-                                    )
-                                    continue
-
+                            # Process PDF files using docling_parse's pdf_parser_v2
+                            logger.info(f"Extracting text from PDF file: {file_path}")
+                            pdf_text = extract_text_from_pdf(file_path)
                             if pdf_text:
                                 file_contents.append(pdf_text)
                                 filepaths.append(Path(file_path))
-
-                            # Unload the document to free memory
-                            PDFParser.unload_document(doc_key)
-                            logger.info(f"Unloaded PDF document: {file_path}")
+                            else:
+                                logger.warning(
+                                    f"PDF file {file_path} could not be processed"
+                                )
 
                         else:
                             logger.info(f"Skipping unsupported file type: {file_path}")

From 51fb86d04baee96412f377174430facd5f6e2693 Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Wed, 12 Feb 2025 15:59:17 -0500
Subject: [PATCH 02/33] Update docling versions

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
---
 requirements.txt                      | 6 +++---
 src/instructlab/sdg/utils/chunkers.py | 3 +--
 src/instructlab/sdg/utils/taxonomy.py | 2 +-
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index ec3c012d..fbe526cf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 click>=8.1.7,<9.0.0
 datasets>=2.18.0,<3.0.0
-docling[tesserocr]>=2.4.2,<=2.8.3; sys_platform != 'darwin'
-docling>=2.4.2,<=2.8.3; sys_platform == 'darwin'
-docling-parse>=2.0.0,<3.0.0
+docling[tesserocr]>=2.9.0; sys_platform != 'darwin'
+docling>=2.9.0; sys_platform == 'darwin'
+docling-parse>=3.3.0
 GitPython>=3.1.42,<4.0.0
 gguf>=0.6.0
 httpx>=0.25.0,<1.0.0
diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index 2dc3a48f..f3983713 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -19,7 +19,6 @@
     TesseractOcrOptions,
 )
 from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
-from tabulate import tabulate
 
 # First Party
 from instructlab.sdg.utils.model_formats import is_model_gguf, is_model_safetensors
@@ -190,7 +189,7 @@ def _process_parsed_docling_json(self, json_fp: Path) -> Dataset:
         with open(json_fp, "r", encoding="utf-8") as f:
             data = json.load(f)
 
-        chunker = HybridChunker(tokenizer=self.tokenizer, max_token_per_chunk=500)
+        chunker = HybridChunker(tokenizer=self.tokenizer, max_tokens=500)
         chunk_iter = chunker.chunk(
             dl_doc=data
         )  # Use hybrid chunker to chunk the document
diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py
index 2aec5b7a..b8bc8de6 100644
--- a/src/instructlab/sdg/utils/taxonomy.py
+++ b/src/instructlab/sdg/utils/taxonomy.py
@@ -151,7 +151,7 @@ def extract_text_from_pdf(file_path: str) -> str:
             )
             page_text = "\n".join(text_lines)
             pdf_text += page_text + "\n"
-        except Exception as e:
+        except Exception as e:  # pylint: disable=broad-exception-caught
             logger.warning(
                 f"Error extracting text from page {page_no} of '{file_path}': {e}"
             )

From 90a7a4b6a2f4eaf7ca950db63b64a933cf0f3e52 Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Wed, 12 Feb 2025 16:16:08 -0500
Subject: [PATCH 03/33] Update easyocr params

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
---
 src/instructlab/sdg/utils/chunkers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index f3983713..4774b710 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -60,7 +60,7 @@ def resolve_ocr_options() -> OcrOptions:
 
         accelerator_options = AcceleratorOptions()
         _ = EasyOcrModel(
-            True, ocr_options, None, accelerator_options=accelerator_options
+            True, None, ocr_options, accelerator_options=accelerator_options
         )
         return ocr_options
     except ImportError:

From 19ba945935f35e46c1c6f99c477bb9a6b3673cf9 Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Wed, 12 Feb 2025 16:27:06 -0500
Subject: [PATCH 04/33] Add docling-core[chunking] to requirements

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index fbe526cf..85c25711 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 click>=8.1.7,<9.0.0
 datasets>=2.18.0,<3.0.0
+docling-core[chunking]>=2.9.0
 docling[tesserocr]>=2.9.0; sys_platform != 'darwin'
 docling>=2.9.0; sys_platform == 'darwin'
 docling-parse>=3.3.0

From def01eaeabd089bb35052820516a234813f6a261 Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Thu, 13 Feb 2025 11:04:35 -0500
Subject: [PATCH 05/33] Update accelerator options for easy ocr

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
---
 src/instructlab/sdg/utils/chunkers.py | 44 +++++++++++++++++++--------
 1 file changed, 32 insertions(+), 12 deletions(-)

diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index 4774b710..bd57caf0 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -12,6 +12,7 @@
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
+    AcceleratorDevice,
     AcceleratorOptions,
     EasyOcrOptions,
     OcrOptions,
@@ -36,7 +37,12 @@ def _num_chars_from_tokens(num_tokens) -> int:
     return int(num_tokens * 4)  # 1 token ~ 4 English character
 
 
-def resolve_ocr_options() -> OcrOptions:
+def resolve_ocr_options(
+    docling_model_path: Optional[Path] = None,
+) -> Optional[OcrOptions]:
+    # Declare ocr_options explicitly as Optional[OcrOptions]
+    ocr_options: Optional[OcrOptions] = None
+
     # First, attempt to use tesserocr
     try:
         ocr_options = TesseractOcrOptions()
@@ -48,19 +54,30 @@ def resolve_ocr_options() -> OcrOptions:
         return ocr_options
     except ImportError:
         # No tesserocr, so try something else
-        pass
+        logger.warning("Tesseract not found, falling back to EasyOCR.")
+
     try:
-        ocr_options = EasyOcrOptions()
+        ocr_options = EasyOcrOptions(
+            lang=["en"],
+            use_gpu=False,
+            confidence_threshold=0.5,
+            model_storage_directory=str(docling_model_path),
+            recog_network="standard",
+            download_enabled=True,
+        )
         # Keep easyocr models on the CPU instead of GPU
         ocr_options.use_gpu = False
+        accelerator_options = AcceleratorOptions(device="cpu")
         # triggers torch loading, import lazily
         # pylint: disable=import-outside-toplevel
         # Third Party
         from docling.models.easyocr_model import EasyOcrModel
 
-        accelerator_options = AcceleratorOptions()
         _ = EasyOcrModel(
-            True, None, ocr_options, accelerator_options=accelerator_options
+            enabled=True,
+            artifacts_path=None,
+            options=ocr_options,
+            accelerator_options=accelerator_options,
         )
         return ocr_options
     except ImportError:
@@ -131,7 +148,7 @@ def _init_docling_converter(self):
             do_ocr=False,
         )
 
-        ocr_options = resolve_ocr_options()
+        ocr_options = resolve_ocr_options(docling_model_path=self.docling_model_path)
         if ocr_options is not None:
             pipeline_options.do_ocr = True
             pipeline_options.ocr_options = ocr_options
@@ -190,10 +207,13 @@ def _process_parsed_docling_json(self, json_fp: Path) -> Dataset:
             data = json.load(f)
 
         chunker = HybridChunker(tokenizer=self.tokenizer, max_tokens=500)
-        chunk_iter = chunker.chunk(
-            dl_doc=data
-        )  # Use hybrid chunker to chunk the document
-        chunks = [chunker.serialize_chunk(chunk) for chunk in chunk_iter]
+
+        try:
+            chunk_iter = chunker.chunk(dl_doc=data)
+            chunks = [chunker.serialize(chunk=chunk) for chunk in chunk_iter]
+        except Exception as e:
+            logger.error(f"Error chunking document {json_fp}: {e}")
+
         fused_texts = self.fuse_texts(chunks, 200)
 
         num_tokens_per_doc = _num_tokens_from_words(self.chunk_word_count)
@@ -317,11 +337,11 @@ def export_documents(self, converted_docs: Iterable[ConversionResult]):
 
                 # Export Deep Search document JSON format:
                 with (docling_artifacts_path / f"{doc_filename}.json").open("w") as fp:
-                    fp.write(json.dumps(doc.legacy_document.export_to_dict()))
+                    fp.write(json.dumps(doc.document.export_to_dict()))
 
                 # Export Markdown format:
                 with (docling_artifacts_path / f"{doc_filename}.md").open("w") as fp:
-                    fp.write(doc.legacy_document.export_to_markdown())
+                    fp.write(doc.document.export_to_markdown())
             else:
                 logger.info(f"Document {doc.input.file} failed to convert.")
                 failure_count += 1

From 635171cfb627642811e0bd2c149c077f5ff108a8 Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Tue, 25 Feb 2025 06:05:03 -0500
Subject: [PATCH 06/33] Update the pdf doc parser unload function for v2 parser

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
---
 src/instructlab/sdg/utils/taxonomy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py
index b8bc8de6..73c424af 100644
--- a/src/instructlab/sdg/utils/taxonomy.py
+++ b/src/instructlab/sdg/utils/taxonomy.py
@@ -158,7 +158,7 @@ def extract_text_from_pdf(file_path: str) -> str:
             continue
 
     # Unload the document to free memory
-    PDFParser.unload_document(file_path)
+    pdf_doc.unload()
     logger.info(f"Unloaded PDF document: {file_path}")
 
     return pdf_text

From 89bda7877441857d698cf7f370c5147ed82a8e7c Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Tue, 25 Feb 2025 06:16:53 -0500
Subject: [PATCH 07/33] Update exception and initialize chunks

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
---
 src/instructlab/sdg/utils/chunkers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index bd57caf0..9f3edb0f 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -12,7 +12,6 @@
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
     AcceleratorOptions,
     EasyOcrOptions,
     OcrOptions,
@@ -207,11 +206,12 @@ def _process_parsed_docling_json(self, json_fp: Path) -> Dataset:
             data = json.load(f)
 
         chunker = HybridChunker(tokenizer=self.tokenizer, max_tokens=500)
+        chunks = []
 
         try:
             chunk_iter = chunker.chunk(dl_doc=data)
             chunks = [chunker.serialize(chunk=chunk) for chunk in chunk_iter]
-        except Exception as e:
+        except Exception as e:  # pylint: disable=broad-exception-caught
             logger.error(f"Error chunking document {json_fp}: {e}")
 
         fused_texts = self.fuse_texts(chunks, 200)

From 7b6f051b43cde2f57037b7b333748725214df7be Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Tue, 25 Feb 2025 07:39:15 -0500
Subject: [PATCH 08/33] Update chunk documents to avoid xporting to JSON and
 re-reading

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
---
 src/instructlab/sdg/utils/chunkers.py | 55 +++++++++------------------
 1 file changed, 18 insertions(+), 37 deletions(-)

diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index 9f3edb0f..c716b791 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -165,19 +165,27 @@ def chunk_documents(self) -> List:
             List: a list of chunks from the documents
         """
 
-        if self.document_paths == []:
-            return []
-
         parsed_documents = self.converter.convert_all(self.document_paths)
+        all_chunks = []
+        for conversion_result in parsed_documents:
+            doc = conversion_result.document
+            chunker = HybridChunker(tokenizer=self.tokenizer, max_tokens=500)
+            try:
+                chunk_iter = chunker.chunk(dl_doc=doc)
+                chunks = [chunker.serialize(chunk=chunk) for chunk in chunk_iter]
+            except Exception as e:
+                logger.error(
+                    f"Error chunking document {conversion_result.input.file}: {e}"
+                )
+                chunks = []
 
-        docling_artifacts_path = self.export_documents(parsed_documents)
-
-        docling_json_paths = list(docling_artifacts_path.glob("*.json"))
-        chunks = []
-        for json_fp in docling_json_paths:
-            chunks.extend(self._process_parsed_docling_json(json_fp))
+            fused_texts = self.fuse_texts(chunks, 200)
+            num_tokens_per_doc = _num_tokens_from_words(self.chunk_word_count)
+            chunk_size = _num_chars_from_tokens(num_tokens_per_doc)
+            final_chunks = chunk_markdowns(fused_texts, chunk_size)
+            all_chunks.extend(final_chunks)
 
-        return chunks
+        return all_chunks
 
     def _path_validator(self, path) -> Path:
         """
@@ -193,33 +201,6 @@ def _path_validator(self, path) -> Path:
                 raise FileNotFoundError(f"{path} does not exist.")
         return path
 
-    def _process_parsed_docling_json(self, json_fp: Path) -> Dataset:
-        """
-        Process the parsed docling json file and return a dataset.
-        Args:
-            json_fp (Path): Path to the parsed docling json file.
-        Returns:
-            List: a list of chunks built from the provided json file
-        """
-        logger.info(f"Processing parsed docling json file: {json_fp}")
-        with open(json_fp, "r", encoding="utf-8") as f:
-            data = json.load(f)
-
-        chunker = HybridChunker(tokenizer=self.tokenizer, max_tokens=500)
-        chunks = []
-
-        try:
-            chunk_iter = chunker.chunk(dl_doc=data)
-            chunks = [chunker.serialize(chunk=chunk) for chunk in chunk_iter]
-        except Exception as e:  # pylint: disable=broad-exception-caught
-            logger.error(f"Error chunking document {json_fp}: {e}")
-
-        fused_texts = self.fuse_texts(chunks, 200)
-
-        num_tokens_per_doc = _num_tokens_from_words(self.chunk_word_count)
-        chunk_size = _num_chars_from_tokens(num_tokens_per_doc)
-        return chunk_markdowns(fused_texts, chunk_size)
-
     def fuse_texts(
         self, text_list: List, short_length_threshold: int = 130
     ) -> List[str]:

From f6c0eb7b6464235c4857aaceb860f40264764595 Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Tue, 25 Feb 2025 08:06:54 -0500
Subject: [PATCH 09/33] Update tests for chunk documents, check for empty
 chunks

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
---
 tests/functional/test_chunkers.py | 37 ++++++++++++++++++++++++++++---
 1 file changed, 34 insertions(+), 3 deletions(-)

diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py
index f06db504..d3b14fbe 100644
--- a/tests/functional/test_chunkers.py
+++ b/tests/functional/test_chunkers.py
@@ -32,10 +32,10 @@ def tokenizer_model_name():
 
 
 @pytest.mark.parametrize(
-    "doc_type, expected_chunks, contains_text",
+    "doc_type, expected_chunks",
     [
-        ("pdf", 9, "Phoenix is a minor constellation"),
-        ("md", 7, None),  # Assuming there's no specific text to check in Markdown
+        ("pdf", 9),
+        ("md", 7),  # Assuming there's no specific text to check in Markdown
     ],
 )
 def test_chunk_documents(
@@ -58,3 +58,34 @@ def test_chunk_documents(
         assert contains_text in chunks[0]
     for chunk in chunks:
         assert len(chunk) < 2500
+
+
+def test_chunk_documents(
+    tmp_path, tokenizer_model_name, test_paths, doc_type, expected_chunks
+):
+    """
+    Generalized test function for chunking documents.
+    Instead of checking for a particular text, we verify that:
+      - The number of chunks is greater than a minimum expected value.
+      - No chunk is empty.
+      - Each chunk is within a reasonable length.
+    """
+    document_path = test_paths[doc_type]
+    chunker = DocumentChunker(
+        document_paths=[document_path],
+        output_dir=tmp_path,
+        tokenizer_model_name=tokenizer_model_name,
+        server_ctx_size=4096,
+        chunk_word_count=500,
+    )
+    chunks = chunker.chunk_documents()
+
+    # Check that we have more chunks than expected
+    assert (
+        len(chunks) > expected_chunks
+    ), f"Expected more than {expected_chunks} chunks, got {len(chunks)}"
+
+    # Check that no chunk is empty and each chunk is within the max allowed length (2500 characters)
+    for chunk in chunks:
+        assert chunk, "Chunk should not be empty"
+        assert len(chunk) < 2500, f"Chunk length {len(chunk)} exceeds maximum allowed"

From d1b7e31d31e171106e2ab56df4ca84a779eea8e0 Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Tue, 25 Feb 2025 11:18:22 -0500
Subject: [PATCH 10/33] Update import for chunking

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
---
 src/instructlab/sdg/utils/chunkers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index c716b791..a33f032b 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -8,7 +8,6 @@
 
 # Third Party
 from datasets import Dataset
-from docling.chunking import HybridChunker
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
@@ -18,6 +17,7 @@
     PdfPipelineOptions,
     TesseractOcrOptions,
 )
+from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
 from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
 
 # First Party

From 1f1cfd352bb55e76ae502d5c2d3e8fc72ac5c665 Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Tue, 25 Feb 2025 11:46:46 -0500
Subject: [PATCH 11/33] Adding transformers and semchunk because module
 requires chunking extra

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
---
 requirements.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/requirements.txt b/requirements.txt
index 85c25711..ce24e735 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,8 +12,10 @@ instructlab-schema>=0.4.0
 jinja2>=3.0.0
 langchain-text-splitters
 openai>=1.13.3,<2.0.0
+semchunk >= 2.2.0
 sentencepiece>=0.2.0
 tabulate>=0.9.0
+transformers >=4.34.0
 
 # Note: this dependency goes along with langchain-text-splitters and may be
 #       removed once that one is removed.

From f1477b78acbb57d189bb39788c9cf8e0e2bd3b8c Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Tue, 25 Feb 2025 11:57:07 -0500
Subject: [PATCH 12/33] Take transformers out of the block from leanimports

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
---
 tests/testdata/leanimports.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/testdata/leanimports.py b/tests/testdata/leanimports.py
index f6a50484..48a3bfe8 100644
--- a/tests/testdata/leanimports.py
+++ b/tests/testdata/leanimports.py
@@ -4,7 +4,7 @@
 import sys
 
 # block slow imports
-for unwanted in ["deepspeed", "llama_cpp", "torch", "transformers", "vllm"]:
+for unwanted in ["deepspeed", "llama_cpp", "torch", "vllm"]:
     # importlib raises ModuleNotFound when sys.modules value is None.
     assert unwanted not in sys.modules
     sys.modules[unwanted] = None  # type: ignore[assignment]

From 5c28c5e2d2159505d1c6797072c0b478411a05cf Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Tue, 25 Feb 2025 12:11:16 -0500
Subject: [PATCH 13/33] Update test_chunkers to update functional tests

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
---
 tests/functional/test_chunkers.py | 41 ++++++++-----------------------
 1 file changed, 10 insertions(+), 31 deletions(-)

diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py
index d3b14fbe..2924653b 100644
--- a/tests/functional/test_chunkers.py
+++ b/tests/functional/test_chunkers.py
@@ -25,52 +25,31 @@ def test_paths():
     }
 
 
-@pytest.fixture
+@pytest.fixture(scope="module")
 def tokenizer_model_name():
     """Fixture to return the path to the tokenizer model."""
     return os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab")
 
 
 @pytest.mark.parametrize(
-    "doc_type, expected_chunks",
+    "document_type, expected_chunks",
     [
         ("pdf", 9),
-        ("md", 7),  # Assuming there's no specific text to check in Markdown
+        ("md", 7),
     ],
 )
 def test_chunk_documents(
-    tmp_path, tokenizer_model_name, test_paths, doc_type, expected_chunks, contains_text
+    tmp_path, tokenizer_model_name, test_paths, document_type, expected_chunks
 ):
     """
     Generalized test function for chunking documents.
-    """
-    document_path = test_paths[doc_type]
-    chunker = DocumentChunker(
-        document_paths=[document_path],
-        output_dir=tmp_path,
-        tokenizer_model_name=tokenizer_model_name,
-        server_ctx_size=4096,
-        chunk_word_count=500,
-    )
-    chunks = chunker.chunk_documents()
-    assert len(chunks) > expected_chunks
-    if contains_text:
-        assert contains_text in chunks[0]
-    for chunk in chunks:
-        assert len(chunk) < 2500
-
 
-def test_chunk_documents(
-    tmp_path, tokenizer_model_name, test_paths, doc_type, expected_chunks
-):
-    """
-    Generalized test function for chunking documents.
-    Instead of checking for a particular text, we verify that:
-      - The number of chunks is greater than a minimum expected value.
+    Verifies that:
+      - The number of chunks is greater than the expected minimum.
       - No chunk is empty.
-      - Each chunk is within a reasonable length.
+      - Each chunk's length is less than 2500 characters.
     """
-    document_path = test_paths[doc_type]
+    document_path = test_paths[document_type]
     chunker = DocumentChunker(
         document_paths=[document_path],
         output_dir=tmp_path,
@@ -80,12 +59,12 @@ def test_chunk_documents(
     )
     chunks = chunker.chunk_documents()
 
-    # Check that we have more chunks than expected
+    # Check that we have more chunks than expected.
     assert (
         len(chunks) > expected_chunks
     ), f"Expected more than {expected_chunks} chunks, got {len(chunks)}"
 
-    # Check that no chunk is empty and each chunk is within the max allowed length (2500 characters)
+    # Check that no chunk is empty and each chunk's length is within the allowed limit.
     for chunk in chunks:
         assert chunk, "Chunk should not be empty"
         assert len(chunk) < 2500, f"Chunk length {len(chunk)} exceeds maximum allowed"

From f24e852030105f61f6a8a718c891a5d1815b0cbc Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Tue, 25 Feb 2025 13:22:01 -0500
Subject: [PATCH 14/33] Update lean imports to not run into OOM error

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
Co-authored-by: Courtney Pacheco <cpacheco@redhat.com>
---
 src/instructlab/sdg/utils/chunkers.py | 2 +-
 tests/testdata/leanimports.py         | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index a33f032b..22fb94d7 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -173,7 +173,7 @@ def chunk_documents(self) -> List:
             try:
                 chunk_iter = chunker.chunk(dl_doc=doc)
                 chunks = [chunker.serialize(chunk=chunk) for chunk in chunk_iter]
-            except Exception as e:
+            except Exception as e:  # pylint: disable=broad-exception-caught
                 logger.error(
                     f"Error chunking document {conversion_result.input.file}: {e}"
                 )
diff --git a/tests/testdata/leanimports.py b/tests/testdata/leanimports.py
index 48a3bfe8..c8f0504f 100644
--- a/tests/testdata/leanimports.py
+++ b/tests/testdata/leanimports.py
@@ -9,6 +9,13 @@
     assert unwanted not in sys.modules
     sys.modules[unwanted] = None  # type: ignore[assignment]
 
+# Try to import in your PR to see if this works around the issue. If not, print an error
+try:
+    # Third Party
+    import docling_core
+except ImportError as e:
+    print(f"Could not import `docling_core` because: {e}")
+
 # First Party
 # This will trigger errors if any of the import chain tries to load
 # the unwanted modules

From c9878c1693c373b8cc58f1bda49194122fb4c477 Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Wed, 26 Feb 2025 02:14:11 -0500
Subject: [PATCH 15/33] Update src/instructlab/sdg/utils/chunkers.py

Deactivate MPS acceleration on Github CI to avoid OOM error

Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
---
 src/instructlab/sdg/utils/chunkers.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index 22fb94d7..f5ac01dd 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -4,13 +4,16 @@
 from typing import Dict, Iterable, List, Optional
 import json
 import logging
+import os
 import re
+import sys
 
 # Third Party
 from datasets import Dataset
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
+    AcceleratorDevice,
     AcceleratorOptions,
     EasyOcrOptions,
     OcrOptions,
@@ -146,7 +149,11 @@ def _init_docling_converter(self):
             artifacts_path=self.docling_model_path,
             do_ocr=False,
         )
-
+        # deactivate MPS acceleration on Github CI
+        if os.getenv("CI") and sys.platform == "darwin":
+            pipeline_options.accelerator_options = AcceleratorOptions(
+                device=AcceleratorDevice.CPU
+            )
         ocr_options = resolve_ocr_options(docling_model_path=self.docling_model_path)
         if ocr_options is not None:
             pipeline_options.do_ocr = True

From 30bd3b5fadbf17f4eef0c78be6773aa46a4e58b2 Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Thu, 27 Feb 2025 09:58:40 -0500
Subject: [PATCH 16/33] Update use_gpu to None for easy ocr

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
---
 src/instructlab/sdg/utils/chunkers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index f5ac01dd..0b33410f 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -68,7 +68,7 @@ def resolve_ocr_options(
             download_enabled=True,
         )
         # Keep easyocr models on the CPU instead of GPU
-        ocr_options.use_gpu = False
+        ocr_options.use_gpu = None
         accelerator_options = AcceleratorOptions(device="cpu")
         # triggers torch loading, import lazily
         # pylint: disable=import-outside-toplevel

From 0376926caca2fb6db78b5de43b3fed035c7b67c2 Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Thu, 27 Feb 2025 10:20:47 -0500
Subject: [PATCH 17/33] Increase from 1.7 to a larger value to avoid the
 PyTorch MPS backend running OOM

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
Co-authored-by: Courtney Pacheco <cpacheco@redhat.com>
---
 .github/workflows/test.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index a59b6d47..e27eedfd 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -107,6 +107,9 @@ jobs:
       - name: Run unit and functional tests with tox
         run: |
           tox
+        env:
+          # Increase from 1.7 to a greater value to avoid the PyTorch MPS backend running OOM.
+          PYTORCH_MPS_HIGH_WATERMARK_RATIO: 2.0
 
       - name: Remove llama-cpp-python from cache
         if: always()

From 7709f0c4dcdce57720303ec9173bf943660fdca8 Mon Sep 17 00:00:00 2001
From: Aakanksha Duggal <aduggal@redhat.com>
Date: Thu, 27 Feb 2025 19:56:51 -0500
Subject: [PATCH 18/33] Remove docling core test from lean imports, add
 transformers back and import semchunk and transformers in chunkers.py

Signed-off-by: Aakanksha Duggal <aduggal@redhat.com>
---
 src/instructlab/sdg/utils/chunkers.py | 2 ++
 tests/testdata/leanimports.py         | 9 +--------
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index 0b33410f..e879e4f0 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -22,6 +22,8 @@
 )
 from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
 from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
+import semchunk
+import transformers
 
 # First Party
 from instructlab.sdg.utils.model_formats import is_model_gguf, is_model_safetensors
diff --git a/tests/testdata/leanimports.py b/tests/testdata/leanimports.py
index c8f0504f..f6a50484 100644
--- a/tests/testdata/leanimports.py
+++ b/tests/testdata/leanimports.py
@@ -4,18 +4,11 @@
 import sys
 
 # block slow imports
-for unwanted in ["deepspeed", "llama_cpp", "torch", "vllm"]:
+for unwanted in ["deepspeed", "llama_cpp", "torch", "transformers", "vllm"]:
     # importlib raises ModuleNotFound when sys.modules value is None.
     assert unwanted not in sys.modules
     sys.modules[unwanted] = None  # type: ignore[assignment]
 
-# Try to import in your PR to see if this works around the issue. If not, print an error
-try:
-    # Third Party
-    import docling_core
-except ImportError as e:
-    print(f"Could not import `docling_core` because: {e}")
-
 # First Party
 # This will trigger errors if any of the import chain tries to load
 # the unwanted modules

From ec99a89e539bfb2f7169a570f9dde83663839f5e Mon Sep 17 00:00:00 2001
From: Cloud User <ec2-user@ip-172-31-44-225.ec2.internal>
Date: Tue, 11 Mar 2025 07:19:27 +0000
Subject: [PATCH 19/33] fix: Move docling_core import inside method to avoid
 top-level transformers import

Signed-off-by: Cloud User <ec2-user@ip-172-31-44-225.ec2.internal>
---
 src/instructlab/sdg/utils/chunkers.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index e879e4f0..e7bdf820 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -20,10 +20,7 @@
     PdfPipelineOptions,
     TesseractOcrOptions,
 )
-from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
 from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
-import semchunk
-import transformers
 
 # First Party
 from instructlab.sdg.utils.model_formats import is_model_gguf, is_model_safetensors
@@ -63,14 +60,12 @@ def resolve_ocr_options(
     try:
         ocr_options = EasyOcrOptions(
             lang=["en"],
-            use_gpu=False,
+            use_gpu=None,
             confidence_threshold=0.5,
             model_storage_directory=str(docling_model_path),
             recog_network="standard",
             download_enabled=True,
         )
-        # Keep easyocr models on the CPU instead of GPU
-        ocr_options.use_gpu = None
         accelerator_options = AcceleratorOptions(device="cpu")
         # triggers torch loading, import lazily
         # pylint: disable=import-outside-toplevel
@@ -173,6 +168,10 @@ def chunk_documents(self) -> List:
         Returns:
             List: a list of chunks from the documents
         """
+        # Move docling_core import inside method where it's used to avoid importing transformers at top level
+        # pylint: disable=import-outside-toplevel
+        # Third Party
+        from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
 
         parsed_documents = self.converter.convert_all(self.document_paths)
         all_chunks = []

From 09e7b09ec98b6d2ea7da89763d005d13c50916b3 Mon Sep 17 00:00:00 2001
From: Cloud User <ec2-user@ip-172-31-44-225.ec2.internal>
Date: Tue, 11 Mar 2025 07:48:21 +0000
Subject: [PATCH 20/33] fix: Remove semchunk and transformers from
 requirements.txt

Signed-off-by: Cloud User <ec2-user@ip-172-31-44-225.ec2.internal>
---
 requirements.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index ce24e735..85c25711 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,10 +12,8 @@ instructlab-schema>=0.4.0
 jinja2>=3.0.0
 langchain-text-splitters
 openai>=1.13.3,<2.0.0
-semchunk >= 2.2.0
 sentencepiece>=0.2.0
 tabulate>=0.9.0
-transformers >=4.34.0
 
 # Note: this dependency goes along with langchain-text-splitters and may be
 #       removed once that one is removed.

From bd3488b91e6fd1430dd4a80a2ca1b6c414f8ddcf Mon Sep 17 00:00:00 2001
From: Cloud User <ec2-user@ip-172-31-44-225.ec2.internal>
Date: Tue, 11 Mar 2025 08:09:33 +0000
Subject: [PATCH 21/33] ci: Update GitHub Actions workflow to use
 macos-latest-xlarge runners

Signed-off-by: Cloud User <ec2-user@ip-172-31-44-225.ec2.internal>
---
 .github/workflows/test.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index e27eedfd..928fb826 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -50,9 +50,9 @@ jobs:
           - "ubuntu-latest"
         include:
           - python: "3.11"
-            platform: "macos-latest"
+            platform: "macos-latest-xlarge"
           - python: "3.12"
-            platform: "macos-latest"
+            platform: "macos-latest-xlarge"
     steps:
       - name: "Harden Runner"
         uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0

From 7c11b3ca7ab77d249d3724900fd92129e422acd3 Mon Sep 17 00:00:00 2001
From: Cloud User <ec2-user@ip-172-31-44-225.ec2.internal>
Date: Tue, 11 Mar 2025 08:20:19 +0000
Subject: [PATCH 22/33] ci: Update GitHub Actions workflow free disk space
 condition to reflect mac-os-latest-xlarge platform

Signed-off-by: Cloud User <ec2-user@ip-172-31-44-225.ec2.internal>
---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 928fb826..50f868fb 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -66,7 +66,7 @@ jobs:
           fetch-depth: 0
 
       - name: Free disk space
-        if: matrix.platform != 'macos-latest'
+        if: matrix.platform != 'macos-latest-xlarge'
         uses: ./.github/actions/free-disk-space
 
       - name: Install the expect package

From 2cdb25e9f310cb1bf24d1e64f6745c5a7a478d2c Mon Sep 17 00:00:00 2001
From: Cloud User <ec2-user@ip-172-31-44-225.ec2.internal>
Date: Tue, 11 Mar 2025 14:49:43 +0000
Subject: [PATCH 23/33] test: Add fixture to force CPU usage on macOS CI
 environments

Signed-off-by: Cloud User <ec2-user@ip-172-31-44-225.ec2.internal>
---
 tests/functional/test_chunkers.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py
index 2924653b..e3be1bce 100644
--- a/tests/functional/test_chunkers.py
+++ b/tests/functional/test_chunkers.py
@@ -1,9 +1,11 @@
 # Standard
 from pathlib import Path
 import os
+import sys
 
 # Third Party
 import pytest
+import torch
 
 # First Party
 from instructlab.sdg.utils.chunkers import DocumentChunker
@@ -31,6 +33,13 @@ def tokenizer_model_name():
     return os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab")
 
 
+@pytest.fixture(scope="module")
+def force_cpu_on_macos_ci():
+    """Force CPU usage on macOS CI environments."""
+    if os.getenv("CI") and sys.platform == "darwin":
+        torch.backends.mps.enabled = False
+
+
 @pytest.mark.parametrize(
     "document_type, expected_chunks",
     [
@@ -39,7 +48,12 @@ def tokenizer_model_name():
     ],
 )
 def test_chunk_documents(
-    tmp_path, tokenizer_model_name, test_paths, document_type, expected_chunks
+    tmp_path,
+    tokenizer_model_name,
+    test_paths,
+    document_type,
+    expected_chunks,
+    force_cpu_on_macos_ci,
 ):
     """
     Generalized test function for chunking documents.

From 0a3eecd5d961c580f0cd2e79eeb8e8c558de2fff Mon Sep 17 00:00:00 2001
From: Cloud User <ec2-user@ip-172-31-44-225.ec2.internal>
Date: Tue, 11 Mar 2025 15:44:40 +0000
Subject: [PATCH 24/33] test: Add fixture to force CPU usage on macOS CI
 environments, disabling MPS

Signed-off-by: Cloud User <ec2-user@ip-172-31-44-225.ec2.internal>
---
 tests/functional/test_chunkers.py | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py
index e3be1bce..8a620c26 100644
--- a/tests/functional/test_chunkers.py
+++ b/tests/functional/test_chunkers.py
@@ -33,12 +33,32 @@ def tokenizer_model_name():
     return os.path.join(TEST_DATA_DIR, "models/instructlab/granite-7b-lab")
 
 
-@pytest.fixture(scope="module")
+@pytest.fixture(scope="module", autouse=True)
 def force_cpu_on_macos_ci():
     """Force CPU usage on macOS CI environments."""
-    if os.getenv("CI") and sys.platform == "darwin":
+    is_ci = bool(os.getenv("CI"))  # Convert to boolean explicitly
+    is_macos = sys.platform == "darwin"
+    print(f"::debug::CI environment: {is_ci}, Platform: {sys.platform}")
+
+    if is_ci and is_macos:
+        print("::notice::Forcing CPU usage on macOS CI environment")
+        # Disable MPS
         torch.backends.mps.enabled = False
 
+        # Force CPU as default device
+        os.environ["PYTORCH_DEVICE"] = "cpu"
+        torch.set_default_device("cpu")
+
+        # Ensure map_location is CPU for torch.load operations
+        def cpu_loader(storage, *args, **kwargs):
+            return storage.cpu()
+
+        torch.load = lambda f, *a, **kw: torch._load_global(
+            f, map_location=cpu_loader, *a, **kw
+        )
+
+    yield
+
 
 @pytest.mark.parametrize(
     "document_type, expected_chunks",
@@ -53,7 +73,6 @@ def test_chunk_documents(
     test_paths,
     document_type,
     expected_chunks,
-    force_cpu_on_macos_ci,
 ):
     """
     Generalized test function for chunking documents.

From 755851c8ec4ec51b86da1f16be35c72459f9852b Mon Sep 17 00:00:00 2001
From: eshwarprasadS <eshwarprasad.s01@gmail.com>
Date: Tue, 11 Mar 2025 17:44:27 +0000
Subject: [PATCH 25/33] test: test debug logging and MPS handling in macOS CI
 environment fixture

Signed-off-by: eshwarprasadS <eshwarprasad.s01@gmail.com>
---
 tests/functional/test_chunkers.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py
index 8a620c26..c3334c3e 100644
--- a/tests/functional/test_chunkers.py
+++ b/tests/functional/test_chunkers.py
@@ -2,6 +2,7 @@
 from pathlib import Path
 import os
 import sys
+import logging
 
 # Third Party
 import pytest
@@ -10,6 +11,10 @@
 # First Party
 from instructlab.sdg.utils.chunkers import DocumentChunker
 
+# Set up logging
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
 # Constants for Test Directory and Test Documents
 TEST_DATA_DIR = os.path.join(os.path.dirname(__file__), "..", "testdata")
 TEST_DOCUMENTS = {
@@ -36,19 +41,20 @@ def tokenizer_model_name():
 @pytest.fixture(scope="module", autouse=True)
 def force_cpu_on_macos_ci():
     """Force CPU usage on macOS CI environments."""
+    logger.debug("=== Starting CPU force fixture ===")
     is_ci = bool(os.getenv("CI"))  # Convert to boolean explicitly
     is_macos = sys.platform == "darwin"
-    print(f"::debug::CI environment: {is_ci}, Platform: {sys.platform}")
+    logger.debug(f"CI environment: {is_ci}, Platform: {sys.platform}")
 
     if is_ci and is_macos:
-        print("::notice::Forcing CPU usage on macOS CI environment")
+        logger.info("Forcing CPU usage on macOS CI environment")
         # Disable MPS
         torch.backends.mps.enabled = False
-
+        
         # Force CPU as default device
         os.environ["PYTORCH_DEVICE"] = "cpu"
         torch.set_default_device("cpu")
-
+        
         # Ensure map_location is CPU for torch.load operations
         def cpu_loader(storage, *args, **kwargs):
             return storage.cpu()
@@ -57,6 +63,15 @@ def cpu_loader(storage, *args, **kwargs):
             f, map_location=cpu_loader, *a, **kw
         )
 
+        # Additional MPS disabling
+        os.environ["PYTORCH_MPS_ALLOCATOR_POLICY"] = "cpu"
+        if hasattr(torch.mps, 'empty_cache'):
+            torch.mps.empty_cache()
+
+    logger.debug(f"After setup - MPS enabled: {torch.backends.mps.enabled}")
+    logger.debug(f"Current device: {os.getenv('PYTORCH_DEVICE', 'not set')}")
+    logger.debug("=== Finished CPU force fixture ===")
+
     yield
 
 

From 16f6b9f3fe19c45c56dbedf8def5e0f9c2467352 Mon Sep 17 00:00:00 2001
From: eshwarprasadS <eshwarprasad.s01@gmail.com>
Date: Tue, 11 Mar 2025 18:00:10 +0000
Subject: [PATCH 26/33] test: Reorder imports and minor formatting in macOS CI
 fixture

Signed-off-by: eshwarprasadS <eshwarprasad.s01@gmail.com>
---
 tests/functional/test_chunkers.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py
index c3334c3e..3eecbc5c 100644
--- a/tests/functional/test_chunkers.py
+++ b/tests/functional/test_chunkers.py
@@ -1,8 +1,8 @@
 # Standard
 from pathlib import Path
+import logging
 import os
 import sys
-import logging
 
 # Third Party
 import pytest
@@ -50,11 +50,11 @@ def force_cpu_on_macos_ci():
         logger.info("Forcing CPU usage on macOS CI environment")
         # Disable MPS
         torch.backends.mps.enabled = False
-        
+
         # Force CPU as default device
         os.environ["PYTORCH_DEVICE"] = "cpu"
         torch.set_default_device("cpu")
-        
+
         # Ensure map_location is CPU for torch.load operations
         def cpu_loader(storage, *args, **kwargs):
             return storage.cpu()
@@ -65,7 +65,7 @@ def cpu_loader(storage, *args, **kwargs):
 
         # Additional MPS disabling
         os.environ["PYTORCH_MPS_ALLOCATOR_POLICY"] = "cpu"
-        if hasattr(torch.mps, 'empty_cache'):
+        if hasattr(torch.mps, "empty_cache"):
             torch.mps.empty_cache()
 
     logger.debug(f"After setup - MPS enabled: {torch.backends.mps.enabled}")

From bde48a35c42de2e4947dfdb5cdf6b907db4b48f2 Mon Sep 17 00:00:00 2001
From: eshwarprasadS <eshwarprasad.s01@gmail.com>
Date: Tue, 11 Mar 2025 18:19:37 +0000
Subject: [PATCH 27/33] test: Simplify macOS device handling in chunkers and
 test fixture

Signed-off-by: eshwarprasadS <eshwarprasad.s01@gmail.com>
---
 src/instructlab/sdg/utils/chunkers.py |  2 +-
 tests/functional/test_chunkers.py     | 12 ++----------
 2 files changed, 3 insertions(+), 11 deletions(-)

diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index e7bdf820..796f3a0f 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -147,7 +147,7 @@ def _init_docling_converter(self):
             do_ocr=False,
         )
         # deactivate MPS acceleration on Github CI
-        if os.getenv("CI") and sys.platform == "darwin":
+        if sys.platform == "darwin":
             pipeline_options.accelerator_options = AcceleratorOptions(
                 device=AcceleratorDevice.CPU
             )
diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py
index 3eecbc5c..241557a0 100644
--- a/tests/functional/test_chunkers.py
+++ b/tests/functional/test_chunkers.py
@@ -42,15 +42,9 @@ def tokenizer_model_name():
 def force_cpu_on_macos_ci():
     """Force CPU usage on macOS CI environments."""
     logger.debug("=== Starting CPU force fixture ===")
-    is_ci = bool(os.getenv("CI"))  # Convert to boolean explicitly
     is_macos = sys.platform == "darwin"
-    logger.debug(f"CI environment: {is_ci}, Platform: {sys.platform}")
-
-    if is_ci and is_macos:
+    if is_macos:
         logger.info("Forcing CPU usage on macOS CI environment")
-        # Disable MPS
-        torch.backends.mps.enabled = False
-
         # Force CPU as default device
         os.environ["PYTORCH_DEVICE"] = "cpu"
         torch.set_default_device("cpu")
@@ -65,11 +59,9 @@ def cpu_loader(storage, *args, **kwargs):
 
         # Additional MPS disabling
         os.environ["PYTORCH_MPS_ALLOCATOR_POLICY"] = "cpu"
-        if hasattr(torch.mps, "empty_cache"):
-            torch.mps.empty_cache()
 
-    logger.debug(f"After setup - MPS enabled: {torch.backends.mps.enabled}")
     logger.debug(f"Current device: {os.getenv('PYTORCH_DEVICE', 'not set')}")
+    logger.debug(f"Available PyTorch devices: CPU={torch.cuda.is_available()}")
     logger.debug("=== Finished CPU force fixture ===")
 
     yield

From cb5f4b134c14331332e51cebd72b317da24b3352 Mon Sep 17 00:00:00 2001
From: eshwarprasadS <eshwarprasad.s01@gmail.com>
Date: Tue, 11 Mar 2025 18:37:28 +0000
Subject: [PATCH 28/33] refactor: Simplify MPS handling in macOS CI test
 fixture

Signed-off-by: eshwarprasadS <eshwarprasad.s01@gmail.com>
---
 src/instructlab/sdg/utils/chunkers.py |  1 -
 tests/functional/test_chunkers.py     | 12 ++----------
 2 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index 796f3a0f..359a2bdf 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -4,7 +4,6 @@
 from typing import Dict, Iterable, List, Optional
 import json
 import logging
-import os
 import re
 import sys
 
diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py
index 241557a0..58e48cec 100644
--- a/tests/functional/test_chunkers.py
+++ b/tests/functional/test_chunkers.py
@@ -49,16 +49,8 @@ def force_cpu_on_macos_ci():
         os.environ["PYTORCH_DEVICE"] = "cpu"
         torch.set_default_device("cpu")
 
-        # Ensure map_location is CPU for torch.load operations
-        def cpu_loader(storage, *args, **kwargs):
-            return storage.cpu()
-
-        torch.load = lambda f, *a, **kw: torch._load_global(
-            f, map_location=cpu_loader, *a, **kw
-        )
-
-        # Additional MPS disabling
-        os.environ["PYTORCH_MPS_ALLOCATOR_POLICY"] = "cpu"
+        # Disable MPS
+        os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "0"
 
     logger.debug(f"Current device: {os.getenv('PYTORCH_DEVICE', 'not set')}")
     logger.debug(f"Available PyTorch devices: CPU={torch.cuda.is_available()}")

From 259bdb583334c566c9611f74808f98992c3a6445 Mon Sep 17 00:00:00 2001
From: eshwarprasadS <eshwarprasad.s01@gmail.com>
Date: Tue, 11 Mar 2025 19:26:23 +0000
Subject: [PATCH 29/33] fix: add CI env check condition to disabling MPS

Signed-off-by: eshwarprasadS <eshwarprasad.s01@gmail.com>
---
 src/instructlab/sdg/utils/chunkers.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index 359a2bdf..e7bdf820 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -4,6 +4,7 @@
 from typing import Dict, Iterable, List, Optional
 import json
 import logging
+import os
 import re
 import sys
 
@@ -146,7 +147,7 @@ def _init_docling_converter(self):
             do_ocr=False,
         )
         # deactivate MPS acceleration on Github CI
-        if sys.platform == "darwin":
+        if os.getenv("CI") and sys.platform == "darwin":
             pipeline_options.accelerator_options = AcceleratorOptions(
                 device=AcceleratorDevice.CPU
             )

From 0d5a749c6ce50ae62d79d1532e736831d47d11b8 Mon Sep 17 00:00:00 2001
From: eshwarprasadS <eshwarprasad.s01@gmail.com>
Date: Tue, 11 Mar 2025 19:36:43 +0000
Subject: [PATCH 30/33] Revert latest commit to fix broken test

Signed-off-by: eshwarprasadS <eshwarprasad.s01@gmail.com>
---
 src/instructlab/sdg/utils/chunkers.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index e7bdf820..359a2bdf 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -4,7 +4,6 @@
 from typing import Dict, Iterable, List, Optional
 import json
 import logging
-import os
 import re
 import sys
 
@@ -147,7 +146,7 @@ def _init_docling_converter(self):
             do_ocr=False,
         )
         # deactivate MPS acceleration on Github CI
-        if os.getenv("CI") and sys.platform == "darwin":
+        if sys.platform == "darwin":
             pipeline_options.accelerator_options = AcceleratorOptions(
                 device=AcceleratorDevice.CPU
             )

From 728443e64a04b0f3e2cdd9a0b993419c546bf946 Mon Sep 17 00:00:00 2001
From: eshwarprasadS <eshwarprasad.s01@gmail.com>
Date: Thu, 13 Mar 2025 17:37:38 +0000
Subject: [PATCH 31/33] ci: update test workflow to use macos-latest platform

Signed-off-by: eshwarprasadS <eshwarprasad.s01@gmail.com>
---
 .github/workflows/test.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 50f868fb..e27eedfd 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -50,9 +50,9 @@ jobs:
           - "ubuntu-latest"
         include:
           - python: "3.11"
-            platform: "macos-latest-xlarge"
+            platform: "macos-latest"
           - python: "3.12"
-            platform: "macos-latest-xlarge"
+            platform: "macos-latest"
     steps:
       - name: "Harden Runner"
         uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
@@ -66,7 +66,7 @@ jobs:
           fetch-depth: 0
 
       - name: Free disk space
-        if: matrix.platform != 'macos-latest-xlarge'
+        if: matrix.platform != 'macos-latest'
         uses: ./.github/actions/free-disk-space
 
       - name: Install the expect package

From a33749b2d8ed0e3032d81e4af506a92c975d26c2 Mon Sep 17 00:00:00 2001
From: eshwarprasadS <eshwarprasad.s01@gmail.com>
Date: Thu, 13 Mar 2025 18:14:53 +0000
Subject: [PATCH 32/33] ci: add CI environment variable to enable macOS MPS
 handling

- Updated tox.ini to pass CI environment variable.
- Modified DocumentChunker to check for CI environment before disabling MPS on macOS.

Signed-off-by: eshwarprasadS <eshwarprasad.s01@gmail.com>
---
 src/instructlab/sdg/utils/chunkers.py | 3 ++-
 tox.ini                               | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py
index 359a2bdf..e7bdf820 100644
--- a/src/instructlab/sdg/utils/chunkers.py
+++ b/src/instructlab/sdg/utils/chunkers.py
@@ -4,6 +4,7 @@
 from typing import Dict, Iterable, List, Optional
 import json
 import logging
+import os
 import re
 import sys
 
@@ -146,7 +147,7 @@ def _init_docling_converter(self):
             do_ocr=False,
         )
         # deactivate MPS acceleration on Github CI
-        if sys.platform == "darwin":
+        if os.getenv("CI") and sys.platform == "darwin":
             pipeline_options.accelerator_options = AcceleratorOptions(
                 device=AcceleratorDevice.CPU
             )
diff --git a/tox.ini b/tox.ini
index cf912fd8..f3f9d829 100644
--- a/tox.ini
+++ b/tox.ini
@@ -12,6 +12,8 @@ description = run tests (unit, unitcov, functional)
 # are huge. This reduces venv from 5.7 GB to 1.5 GB.
 setenv =
     PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu
+passenv =
+    CI
 package = wheel
 wheel_build_env = pkg
 deps = -r requirements-dev.txt

From a8273f4007ff6cfb7ee372d89ae9c8153060c5fe Mon Sep 17 00:00:00 2001
From: eshwarprasadS <eshwarprasad.s01@gmail.com>
Date: Thu, 13 Mar 2025 21:44:51 +0000
Subject: [PATCH 33/33] refactor: Remove PDF extraction using docling parse,
 remove all references to docling parse

Signed-off-by: eshwarprasadS <eshwarprasad.s01@gmail.com>
---
 requirements.txt                      |   1 -
 src/instructlab/sdg/utils/taxonomy.py | 104 ++++----------------------
 2 files changed, 15 insertions(+), 90 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 85c25711..356b7b7e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,6 @@ datasets>=2.18.0,<3.0.0
 docling-core[chunking]>=2.9.0
 docling[tesserocr]>=2.9.0; sys_platform != 'darwin'
 docling>=2.9.0; sys_platform == 'darwin'
-docling-parse>=3.3.0
 GitPython>=3.1.42,<4.0.0
 gguf>=0.6.0
 httpx>=0.25.0,<1.0.0
diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py
index 73c424af..c6a9799c 100644
--- a/src/instructlab/sdg/utils/taxonomy.py
+++ b/src/instructlab/sdg/utils/taxonomy.py
@@ -13,7 +13,6 @@
 from datasets import Dataset
 
 # pylint: disable=no-name-in-module
-from docling_parse.pdf_parsers import pdf_parser_v2
 from instructlab.schema.taxonomy import DEFAULT_TAXONOMY_FOLDERS as TAXONOMY_FOLDERS
 from instructlab.schema.taxonomy import (
     TaxonomyMessageFormat,
@@ -27,9 +26,6 @@
 # Local
 from .chunkers import DocumentChunker
 
-# Initialize the pdf parser
-PDFParser = pdf_parser_v2("error")
-
 logger = logging.getLogger(__name__)
 
 
@@ -122,55 +118,13 @@ def _string_contains_html(s: str) -> bool:
     return bool(html_tag_pattern.search(s))
 
 
-def extract_text_from_pdf(file_path: str) -> str:
-    """
-    Extracts text from a PDF file using the docling PdfParser API.
-
-    Args:
-        file_path (str): Path to the PDF file.
-
-    Returns:
-        str: The extracted text from all pages.
-    """
-    pdf_text = ""
-    # Load the PDF document in lazy mode. (Do not pass a doc_key here.)
-    pdf_doc = PDFParser.load(path_or_stream=file_path, lazy=True)
-    if pdf_doc is None:
-        logger.error(f"Failed to load PDF: {file_path}")
-        return ""
-
-    num_pages = pdf_doc.number_of_pages()
-    logger.info(f"PDF '{file_path}' has {num_pages} pages.")
-
-    # Note: The high-level API expects page numbers to be 1-indexed.
-    for page_no in range(1, num_pages + 1):
-        try:
-            pdf_page = pdf_doc.get_page(page_no=page_no)
-            text_lines = pdf_page.sanitized.export_to_textlines(
-                add_fontkey=True, add_fontname=False
-            )
-            page_text = "\n".join(text_lines)
-            pdf_text += page_text + "\n"
-        except Exception as e:  # pylint: disable=broad-exception-caught
-            logger.warning(
-                f"Error extracting text from page {page_no} of '{file_path}': {e}"
-            )
-            continue
-
-    # Unload the document to free memory
-    pdf_doc.unload()
-    logger.info(f"Unloaded PDF document: {file_path}")
-
-    return pdf_text
-
-
 def _get_documents(
     source: Dict[str, Union[str, List[str]]],
     skip_checkout: bool = False,
     document_output_dir: Path = None,
-) -> Tuple[List[str], List[Path]]:
+) -> Tuple[List[Path], List[Path]]:
     """
-    Retrieve the content of files (Markdown and PDFs) from a Git repository.
+    Retrieve file paths (Markdown and PDFs) from a Git repository.
 
     Args:
         source (dict): Source info containing repository URL, commit hash, and list of file patterns.
@@ -178,8 +132,8 @@ def _get_documents(
         document_output_dir (Path, optional): Directory to clone the repository into. Defaults to current directory.
 
     Returns:
-        Tuple[List[str], List[Path]]:
-            - List of document contents (Markdown as text and PDFs as extracted text).
+        Tuple[List[Path], List[Path]]:
+            - List of file paths for valid documents (Markdown and PDFs).
             - List of corresponding file paths.
 
     Raises:
@@ -196,12 +150,10 @@ def _get_documents(
         if not skip_checkout and commit_hash:
             repo.git.checkout(commit_hash)
 
-        file_contents = []
         filepaths = []
 
         logger.info("Processing files...")
         for pattern in file_patterns:
-            # Use glob to find files matching the pattern
             matched_files = glob.glob(
                 os.path.join(repo.working_dir, pattern), recursive=True
             )
@@ -211,37 +163,13 @@ def _get_documents(
                 if os.path.isfile(file_path):
                     logger.info(f"Processing file: {file_path}")
                     try:
-                        if file_path.lower().endswith(".md"):
-                            # Process Markdown files
-                            with open(file_path, "r", encoding="utf-8") as file:
-                                content = file.read()
-                                if _string_contains_html(content):
-                                    logging.warning(
-                                        f"Provided markdown file {file_path} contains HTML contents, which is currently unsupported as a part of markdown"
-                                        "NOTE: Continuing this might affect your data generation quality."
-                                        "To get best results please format your markdown documents without the use of HTML or use a different document filetype."
-                                    )
-                                file_contents.append(content)
-                                filepaths.append(Path(file_path))
-                                logger.info(
-                                    f"Appended Markdown content from {file_path}"
-                                )
-
-                        elif file_path.lower().endswith(".pdf"):
-                            # Process PDF files using docling_parse's pdf_parser_v2
-                            logger.info(f"Extracting text from PDF file: {file_path}")
-                            pdf_text = extract_text_from_pdf(file_path)
-                            if pdf_text:
-                                file_contents.append(pdf_text)
-                                filepaths.append(Path(file_path))
-                            else:
-                                logger.warning(
-                                    f"PDF file {file_path} could not be processed"
-                                )
-
+                        if file_path.lower().endswith((".md", ".pdf")):
+                            filepaths.append(Path(file_path))
+                            logger.info(f"Added file path: {file_path}")
                         else:
                             logger.info(f"Skipping unsupported file type: {file_path}")
-                    except Exception as file_error:  # pylint: disable=broad-exception-caught
+                    # pylint: disable=broad-exception-caught
+                    except Exception as file_error:
                         logger.error(
                             f"Error processing file '{file_path}': {file_error}"
                         )
@@ -249,8 +177,8 @@ def _get_documents(
                 else:
                     logger.info(f"Skipping non-file path: {file_path}")
 
-        if file_contents:
-            return file_contents, filepaths
+        if filepaths:
+            return filepaths, filepaths
         raise SystemExit("Couldn't find knowledge documents")
 
     except (OSError, git.exc.GitCommandError, FileNotFoundError) as e:
@@ -284,17 +212,17 @@ def _read_taxonomy_file(
         task_description = contents.get("task_description", None)
         domain = contents.get("domain")
         documents = contents.get("document")
-        document_contents, doc_filepaths = None, None
+        doc_filepaths, _ = None, None
         if documents:
             os.makedirs(document_output_dir, exist_ok=True)
             unique_output_dir = mkdtemp(
                 prefix=f"{leaf_node_path}_", dir=document_output_dir
             )
-            document_contents, doc_filepaths = _get_documents(
+            doc_filepaths, _ = _get_documents(
                 source=documents,
                 document_output_dir=unique_output_dir,
             )
-            logger.debug("Content from git repo fetched")
+            logger.debug("File paths from git repo fetched")
 
         for seed_example in contents.get("seed_examples"):
             context = seed_example.get("context", "")
@@ -305,7 +233,6 @@ def _read_taxonomy_file(
                         "questions_and_answers": question_answer_list,
                         "context": context,
                         "taxonomy_path": tax_path,
-                        "documents": document_contents,
                         "filepaths": doc_filepaths,
                         "domain": domain,
                         "document_outline": contents.get("document_outline"),
@@ -322,7 +249,6 @@ def _read_taxonomy_file(
                         "output": answer,
                         "taxonomy_path": tax_path,
                         "task_description": task_description,
-                        "document": documents,
                         "domain": domain,
                     }
                 )
@@ -496,7 +422,7 @@ def leaf_node_to_samples(
     docling_model_path=None,
 ):
     samples = []
-    if leaf_node and leaf_node[0].get("documents"):
+    if leaf_node and leaf_node[0].get("filepaths"):
         samples = _knowledge_leaf_node_to_samples(
             leaf_node,
             server_ctx_size,