Update Docling version and improve OCR options handling with new docling version #574

Open · wants to merge 5 commits into base: main
requirements.txt (5 changes: 2 additions & 3 deletions)
@@ -1,9 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 click>=8.1.7,<9.0.0
 datasets>=2.18.0,<3.0.0
-docling[tesserocr]>=2.4.2,<=2.8.3; sys_platform != 'darwin'
-docling>=2.4.2,<=2.8.3; sys_platform == 'darwin'
-docling-parse>=2.0.0,<3.0.0
+docling[tesserocr]>=2.18.0; sys_platform != 'darwin'
+docling>=2.18.0; sys_platform == 'darwin'
 GitPython>=3.1.42,<4.0.0
 gguf>=0.6.0
 httpx>=0.25.0,<1.0.0
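Reviewer note: the markers above keep the `tesserocr` extra off macOS, where tesserocr is hard to build, and the `<=2.8.3` ceiling plus the separate `docling-parse` pin are dropped now that docling >= 2.18.0 bundles its own parser. A quick sketch of how pip evaluates these PEP 508 markers; this is not part of the PR, and it assumes the third-party `packaging` library (which pip vendors internally):

```python
# Evaluate the PEP 508 environment markers used in requirements.txt.
from packaging.markers import Marker

non_mac = Marker("sys_platform != 'darwin'")
print(non_mac.evaluate({"sys_platform": "linux"}))   # True  -> docling[tesserocr]
print(non_mac.evaluate({"sys_platform": "darwin"}))  # False -> plain docling
```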
src/instructlab/sdg/utils/chunkers.py (46 changes: 38 additions & 8 deletions)
@@ -4,13 +4,17 @@
 from typing import Dict, Iterable, List, Optional
 import json
 import logging
+import os
 import re
+import sys

 # Third Party
 from datasets import Dataset
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
+    AcceleratorDevice,
+    AcceleratorOptions,
     EasyOcrOptions,
     OcrOptions,
     PdfPipelineOptions,
@@ -35,29 +39,50 @@ def _num_chars_from_tokens(num_tokens) -> int:
     return int(num_tokens * 4)  # 1 token ~ 4 English character


-def resolve_ocr_options() -> OcrOptions:
+def resolve_ocr_options(
+    docling_model_path: Optional[Path] = None,
+) -> Optional[OcrOptions]:
+    # Declare ocr_options explicitly as Optional[OcrOptions]
+    ocr_options: Optional[OcrOptions] = None
+
     # First, attempt to use tesserocr
     try:
         ocr_options = TesseractOcrOptions()
         # pylint: disable=import-outside-toplevel
         # Third Party
         from docling.models.tesseract_ocr_model import TesseractOcrModel

-        _ = TesseractOcrModel(True, ocr_options)
+        _ = TesseractOcrModel(
+            enabled=True,
+            artifacts_path=docling_model_path,
+            options=ocr_options,
+            accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CPU),
+        )
         return ocr_options
     except ImportError:
         # No tesserocr, so try something else
-        pass
+        logger.warning("Tesseract not found, falling back to EasyOCR.")
+
     try:
-        ocr_options = EasyOcrOptions()
-        # Keep easyocr models on the CPU instead of GPU
-        ocr_options.use_gpu = False
+        ocr_options = EasyOcrOptions(
+            lang=["en"],
+            use_gpu=None,
+            confidence_threshold=0.5,
+            model_storage_directory=str(docling_model_path),
+            recog_network="standard",
+            download_enabled=True,
+        )
         # triggers torch loading, import lazily
         # pylint: disable=import-outside-toplevel
         # Third Party
         from docling.models.easyocr_model import EasyOcrModel

-        _ = EasyOcrModel(True, ocr_options)
+        _ = EasyOcrModel(
+            enabled=True,
+            artifacts_path=None,
+            options=ocr_options,
+            accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CPU),
+        )
         return ocr_options
     except ImportError:
         # no easyocr either, so don't use any OCR
@@ -127,7 +152,12 @@ def _init_docling_converter(self):
             do_ocr=False,
         )

-        ocr_options = resolve_ocr_options()
+        # deactivate MPS acceleration on Github CI
+        if os.getenv("CI") and sys.platform == "darwin":
+            pipeline_options.accelerator_options = AcceleratorOptions(
+                device=AcceleratorDevice.CPU
+            )
+        ocr_options = resolve_ocr_options(docling_model_path=self.docling_model_path)
         if ocr_options is not None:
             pipeline_options.do_ocr = True
             pipeline_options.ocr_options = ocr_options
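Reviewer note: a minimal sketch of how the reworked `resolve_ocr_options` is consumed downstream, following the docling 2.x converter API as far as I know it; the model path here is hypothetical and the block is illustrative, not part of the diff:

```python
from pathlib import Path

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

from instructlab.sdg.utils.chunkers import resolve_ocr_options

pipeline_options = PdfPipelineOptions(do_ocr=False)

# Tesseract is tried first; an ImportError falls through to EasyOCR, and if
# neither backend is importable the function returns None and OCR stays off.
ocr_options = resolve_ocr_options(
    docling_model_path=Path.home() / ".cache" / "instructlab" / "models"  # hypothetical path
)
if ocr_options is not None:
    pipeline_options.do_ocr = True
    pipeline_options.ocr_options = ocr_options

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
```

Forcing `AcceleratorDevice.CPU` when probing the backends keeps the availability check cheap; the actual conversion can still pick a GPU device later.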
src/instructlab/sdg/utils/taxonomy.py (94 changes: 15 additions & 79 deletions)
@@ -3,17 +3,14 @@
 # Standard
 from pathlib import Path
 from tempfile import mkdtemp
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Union
 import glob
 import logging
 import os
 import re

 # Third Party
 from datasets import Dataset
-
-# pylint: disable=no-name-in-module
-from docling_parse.docling_parse import pdf_parser_v1
 from instructlab.schema.taxonomy import DEFAULT_TAXONOMY_FOLDERS as TAXONOMY_FOLDERS
 from instructlab.schema.taxonomy import (
     TaxonomyMessageFormat,
@@ -27,9 +24,6 @@
 # Local
 from .chunkers import DocumentChunker

-# Initialize the pdf parser
-PDFParser = pdf_parser_v1()
-
 logger = logging.getLogger(__name__)


@@ -126,9 +120,9 @@ def _get_documents(
     source: Dict[str, Union[str, List[str]]],
     skip_checkout: bool = False,
     document_output_dir: Path = None,
-) -> Tuple[List[str], List[Path]]:
+) -> List[Path]:
     """
-    Retrieve the content of files (Markdown and PDF) from a Git repository.
+    Retrieve the file paths of files (Markdown and PDF) from a Git repository.

     Args:
         source (dict): Source info containing repository URL, commit hash, and list of file patterns.
@@ -147,14 +141,13 @@
     repo_url = source.get("repo")
     commit_hash = source.get("commit")
     file_patterns = source.get("patterns", [])
-
-    try:  # pylint: disable=too-many-nested-blocks
+    # pylint: disable=too-many-nested-blocks
+    try:
         repo = git.Repo.clone_from(repo_url, document_output_dir)

         if not skip_checkout and commit_hash:
             repo.git.checkout(commit_hash)

-        file_contents = []
         filepaths = []

         logger.info("Processing files...")
@@ -170,7 +163,6 @@
logger.info(f"Processing file: {file_path}")
try:
if file_path.lower().endswith(".md"):
# Process Markdown files
with open(file_path, "r", encoding="utf-8") as file:
content = file.read()
if _string_contains_html(content):
@@ -179,75 +171,19 @@
"NOTE: Continuing this might affect your data generation quality."
"To get best results please format your markdown documents without the use of HTML or use a different document filetype."
)
file_contents.append(content)
filepaths.append(Path(file_path))
logger.info(
f"Appended Markdown content from {file_path}"
)

elif file_path.lower().endswith(".pdf"):
# Process PDF files using docling_parse's pdf_parser_v1
doc_key = f"key_{os.path.basename(file_path)}" # Unique document key
logger.info(f"Loading PDF document from {file_path}")

success = PDFParser.load_document(doc_key, file_path)
if not success:
logger.warning(
f"Failed to load PDF document: {file_path}"
)
continue

num_pages = PDFParser.number_of_pages(doc_key)
logger.info(f"PDF '{file_path}' has {num_pages} pages.")

pdf_text = ""

for page in range(num_pages):
try:
json_doc = PDFParser.parse_pdf_from_key_on_page(
doc_key, page
)
if "pages" not in json_doc or not json_doc["pages"]:
logger.warning(
f"Page {page + 1} could not be parsed in '{file_path}'"
)
continue

json_page = json_doc["pages"][0]

# Extract text from cells
for cell in json_page.get("cells", []):
text = cell.get("content", {}).get(
"rnormalized", ""
)
if text.strip(): # Only append non-empty text
pdf_text += text.strip() + "\n"
except Exception as page_error: # pylint: disable=broad-exception-caught
logger.warning(
f"Error parsing page {page + 1} of '{file_path}': {page_error}"
)
continue

if pdf_text:
file_contents.append(pdf_text)
filepaths.append(Path(file_path))

# Unload the document to free memory
PDFParser.unload_document(doc_key)
logger.info(f"Unloaded PDF document: {file_path}")

else:
logger.info(f"Skipping unsupported file type: {file_path}")
except Exception as file_error: # pylint: disable=broad-exception-caught
filepaths.append(Path(file_path))
logger.info(f"Collected filepath: {file_path}")
# pylint: disable=broad-exception-caught
except Exception as file_error:
logger.error(
f"Error processing file '{file_path}': {file_error}"
)
continue
else:
logger.info(f"Skipping non-file path: {file_path}")

if file_contents:
return file_contents, filepaths
if filepaths:
return filepaths
raise SystemExit("Couldn't find knowledge documents")

except (OSError, git.exc.GitCommandError, FileNotFoundError) as e:
@@ -281,13 +217,13 @@ def _read_taxonomy_file(
     task_description = contents.get("task_description", None)
     domain = contents.get("domain")
     documents = contents.get("document")
-    document_contents, doc_filepaths = None, None
+    doc_filepaths = None
     if documents:
         os.makedirs(document_output_dir, exist_ok=True)
         unique_output_dir = mkdtemp(
             prefix=f"{leaf_node_path}_", dir=document_output_dir
         )
-        document_contents, doc_filepaths = _get_documents(
+        doc_filepaths = _get_documents(
             source=documents,
             document_output_dir=unique_output_dir,
         )
@@ -302,7 +238,6 @@
"questions_and_answers": question_answer_list,
"context": context,
"taxonomy_path": tax_path,
"documents": document_contents,
"filepaths": doc_filepaths,
"domain": domain,
"document_outline": contents.get("document_outline"),
@@ -493,7 +428,8 @@ def leaf_node_to_samples(
     docling_model_path=None,
 ):
     samples = []
-    if leaf_node and leaf_node[0].get("documents"):
+    # check if the leaf node has document filepaths, if so, it's a knowledge leaf node
+    if leaf_node and (leaf_node[0].get("filepaths")):
         samples = _knowledge_leaf_node_to_samples(
             leaf_node,
             server_ctx_size,
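Reviewer note: with the docling_parse-based extraction gone, `_get_documents` now only collects paths and leaves reading and parsing to the `DocumentChunker`. A hypothetical call illustrating the narrower contract, as if written inside taxonomy.py; the repo URL, commit, and output directory are made up:

```python
from pathlib import Path

# Hypothetical source dict in the shape the taxonomy yaml provides.
source = {
    "repo": "https://github.com/example/knowledge-docs.git",
    "commit": "0123abc",
    "patterns": ["*.md", "*.pdf"],
}

filepaths = _get_documents(
    source=source,
    document_output_dir=Path("/tmp/knowledge_docs"),
)
# Previously this returned (file_contents, filepaths); now it is just
# List[Path], and the leaf node stores it under "filepaths" rather than
# carrying pre-extracted text under "documents".
assert all(isinstance(p, Path) for p in filepaths)
```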
tests/functional/test_chunkers.py (5 changes: 4 additions & 1 deletion)
@@ -55,6 +55,9 @@ def test_chunk_documents(
     chunks = chunker.chunk_documents()
     assert len(chunks) > expected_chunks
     if contains_text:
-        assert contains_text in chunks[0]
+        # Normalize spaces and remove newlines for more flexible text comparison
+        normalized_chunk = " ".join(chunks[0].replace("\n", " ").split())
+        normalized_text = " ".join(contains_text.split())
+        assert normalized_text in normalized_chunk
     for chunk in chunks:
         assert len(chunk) < 2500