From 84f4c7fecf43e2437f15492239e431c3b9f8437f Mon Sep 17 00:00:00 2001 From: eshwarprasadS Date: Tue, 18 Mar 2025 19:30:41 +0000 Subject: [PATCH 1/7] feat: update docling requirements and improve OCR options handling with bumped ver. Signed-off-by: eshwarprasadS --- requirements.txt | 5 +- src/instructlab/sdg/utils/chunkers.py | 44 +++++++++++--- src/instructlab/sdg/utils/taxonomy.py | 87 ++++----------------------- tests/functional/test_chunkers.py | 5 +- tox.ini | 2 + 5 files changed, 55 insertions(+), 88 deletions(-) diff --git a/requirements.txt b/requirements.txt index ec3c012d..aceefdbc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 click>=8.1.7,<9.0.0 datasets>=2.18.0,<3.0.0 -docling[tesserocr]>=2.4.2,<=2.8.3; sys_platform != 'darwin' -docling>=2.4.2,<=2.8.3; sys_platform == 'darwin' -docling-parse>=2.0.0,<3.0.0 +docling[tesserocr]>=2.18.0; sys_platform != 'darwin' +docling>=2.18.0; sys_platform == 'darwin' GitPython>=3.1.42,<4.0.0 gguf>=0.6.0 httpx>=0.25.0,<1.0.0 diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index e256c969..febee1ba 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -5,12 +5,16 @@ import json import logging import re +import os +import sys # Third Party from datasets import Dataset from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( + AcceleratorDevice, + AcceleratorOptions, EasyOcrOptions, OcrOptions, PdfPipelineOptions, @@ -35,7 +39,12 @@ def _num_chars_from_tokens(num_tokens) -> int: return int(num_tokens * 4) # 1 token ~ 4 English character -def resolve_ocr_options() -> OcrOptions: +def resolve_ocr_options( + docling_model_path: Optional[Path] = None, +) -> OcrOptions: + # Declare ocr_options explicitly as Optional[OcrOptions] + ocr_options: Optional[OcrOptions] = None + # First, attempt to use tesserocr try: ocr_options = TesseractOcrOptions() @@ -43,21 +52,35 @@ def resolve_ocr_options() -> OcrOptions: # Third Party from docling.models.tesseract_ocr_model import TesseractOcrModel - _ = TesseractOcrModel(True, ocr_options) + _ = TesseractOcrModel(enabled=True, + artifacts_path=docling_model_path, + options=ocr_options, + accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CPU)) return ocr_options except ImportError: # No tesserocr, so try something else - pass + logger.warning("Tesseract not found, falling back to EasyOCR.") + try: - ocr_options = EasyOcrOptions() - # Keep easyocr models on the CPU instead of GPU - ocr_options.use_gpu = False + ocr_options = EasyOcrOptions( + lang=["en"], + use_gpu=None, + confidence_threshold=0.5, + model_storage_directory=str(docling_model_path), + recog_network="standard", + download_enabled=True, + ) # triggers torch loading, import lazily # pylint: disable=import-outside-toplevel # Third Party from docling.models.easyocr_model import EasyOcrModel - _ = EasyOcrModel(True, ocr_options) + _ = EasyOcrModel( + enabled=True, + artifacts_path=None, + options=ocr_options, + accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CPU), + ) return ocr_options except ImportError: # no easyocr either, so don't use any OCR @@ -127,7 +150,12 @@ def _init_docling_converter(self): do_ocr=False, ) - ocr_options = resolve_ocr_options() + # deactivate MPS acceleration on Github CI + if os.getenv("CI") and sys.platform == "darwin": + 
pipeline_options.accelerator_options = AcceleratorOptions( + device=AcceleratorDevice.CPU + ) + ocr_options = resolve_ocr_options(docling_model_path=self.docling_model_path) if ocr_options is not None: pipeline_options.do_ocr = True pipeline_options.ocr_options = ocr_options diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index ed0d7940..d10fcd81 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -12,8 +12,6 @@ # Third Party from datasets import Dataset -# pylint: disable=no-name-in-module -from docling_parse.docling_parse import pdf_parser_v1 from instructlab.schema.taxonomy import DEFAULT_TAXONOMY_FOLDERS as TAXONOMY_FOLDERS from instructlab.schema.taxonomy import ( TaxonomyMessageFormat, @@ -27,9 +25,6 @@ # Local from .chunkers import DocumentChunker -# Initialize the pdf parser -PDFParser = pdf_parser_v1() - logger = logging.getLogger(__name__) @@ -126,9 +121,9 @@ def _get_documents( source: Dict[str, Union[str, List[str]]], skip_checkout: bool = False, document_output_dir: Path = None, -) -> Tuple[List[str], List[Path]]: +) -> Tuple[List[Path]]: """ - Retrieve the content of files (Markdown and PDF) from a Git repository. + Retrieve the file paths of files (Markdown and PDF) from a Git repository. Args: source (dict): Source info containing repository URL, commit hash, and list of file patterns. @@ -148,13 +143,12 @@ def _get_documents( commit_hash = source.get("commit") file_patterns = source.get("patterns", []) - try: # pylint: disable=too-many-nested-blocks + try: repo = git.Repo.clone_from(repo_url, document_output_dir) if not skip_checkout and commit_hash: repo.git.checkout(commit_hash) - file_contents = [] filepaths = [] logger.info("Processing files...") @@ -170,7 +164,6 @@ def _get_documents( logger.info(f"Processing file: {file_path}") try: if file_path.lower().endswith(".md"): - # Process Markdown files with open(file_path, "r", encoding="utf-8") as file: content = file.read() if _string_contains_html(content): @@ -179,66 +172,9 @@ def _get_documents( "NOTE: Continuing this might affect your data generation quality." "To get best results please format your markdown documents without the use of HTML or use a different document filetype." 
) - file_contents.append(content) - filepaths.append(Path(file_path)) - logger.info( - f"Appended Markdown content from {file_path}" - ) - - elif file_path.lower().endswith(".pdf"): - # Process PDF files using docling_parse's pdf_parser_v1 - doc_key = f"key_{os.path.basename(file_path)}" # Unique document key - logger.info(f"Loading PDF document from {file_path}") - - success = PDFParser.load_document(doc_key, file_path) - if not success: - logger.warning( - f"Failed to load PDF document: {file_path}" - ) - continue - - num_pages = PDFParser.number_of_pages(doc_key) - logger.info(f"PDF '{file_path}' has {num_pages} pages.") - - pdf_text = "" - - for page in range(num_pages): - try: - json_doc = PDFParser.parse_pdf_from_key_on_page( - doc_key, page - ) - if "pages" not in json_doc or not json_doc["pages"]: - logger.warning( - f"Page {page + 1} could not be parsed in '{file_path}'" - ) - continue - - json_page = json_doc["pages"][0] - - # Extract text from cells - for cell in json_page.get("cells", []): - text = cell.get("content", {}).get( - "rnormalized", "" - ) - if text.strip(): # Only append non-empty text - pdf_text += text.strip() + "\n" - except Exception as page_error: # pylint: disable=broad-exception-caught - logger.warning( - f"Error parsing page {page + 1} of '{file_path}': {page_error}" - ) - continue - - if pdf_text: - file_contents.append(pdf_text) - filepaths.append(Path(file_path)) - - # Unload the document to free memory - PDFParser.unload_document(doc_key) - logger.info(f"Unloaded PDF document: {file_path}") - - else: - logger.info(f"Skipping unsupported file type: {file_path}") - except Exception as file_error: # pylint: disable=broad-exception-caught + filepaths.append(Path(file_path)) + logger.info(f"Collected filepath: {file_path}") + except Exception as file_error: logger.error( f"Error processing file '{file_path}': {file_error}" ) @@ -246,8 +182,8 @@ def _get_documents( else: logger.info(f"Skipping non-file path: {file_path}") - if file_contents: - return file_contents, filepaths + if filepaths: + return filepaths raise SystemExit("Couldn't find knowledge documents") except (OSError, git.exc.GitCommandError, FileNotFoundError) as e: @@ -281,13 +217,13 @@ def _read_taxonomy_file( task_description = contents.get("task_description", None) domain = contents.get("domain") documents = contents.get("document") - document_contents, doc_filepaths = None, None + doc_filepaths = None if documents: os.makedirs(document_output_dir, exist_ok=True) unique_output_dir = mkdtemp( prefix=f"{leaf_node_path}_", dir=document_output_dir ) - document_contents, doc_filepaths = _get_documents( + doc_filepaths = _get_documents( source=documents, document_output_dir=unique_output_dir, ) @@ -302,7 +238,6 @@ def _read_taxonomy_file( "questions_and_answers": question_answer_list, "context": context, "taxonomy_path": tax_path, - "documents": document_contents, "filepaths": doc_filepaths, "domain": domain, "document_outline": contents.get("document_outline"), @@ -493,7 +428,7 @@ def leaf_node_to_samples( docling_model_path=None, ): samples = [] - if leaf_node and leaf_node[0].get("documents"): + if leaf_node and (leaf_node[0].get("filepaths")): samples = _knowledge_leaf_node_to_samples( leaf_node, server_ctx_size, diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py index f06db504..ca2c400e 100644 --- a/tests/functional/test_chunkers.py +++ b/tests/functional/test_chunkers.py @@ -55,6 +55,9 @@ def test_chunk_documents( chunks = chunker.chunk_documents() assert 
len(chunks) > expected_chunks if contains_text: - assert contains_text in chunks[0] + # Normalize spaces and remove newlines for more flexible text comparison + normalized_chunk = ' '.join(chunks[0].replace('\n', ' ').split()) + normalized_text = ' '.join(contains_text.split()) + assert normalized_text in normalized_chunk for chunk in chunks: assert len(chunk) < 2500 diff --git a/tox.ini b/tox.ini index cf912fd8..765c057f 100644 --- a/tox.ini +++ b/tox.ini @@ -12,6 +12,8 @@ description = run tests (unit, unitcov, functional) # are huge. This reduces venv from 5.7 GB to 1.5 GB. setenv = PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu +pass_env = + CI package = wheel wheel_build_env = pkg deps = -r requirements-dev.txt From 7f18d2888c7c94eab982b7cc41b3a4bb7cbef47d Mon Sep 17 00:00:00 2001 From: eshwarprasadS Date: Tue, 18 Mar 2025 21:06:27 +0000 Subject: [PATCH 2/7] chore: linting.. Signed-off-by: eshwarprasadS --- src/instructlab/sdg/utils/chunkers.py | 18 ++++++++++-------- src/instructlab/sdg/utils/taxonomy.py | 4 ++-- tests/functional/test_chunkers.py | 4 ++-- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index febee1ba..779c2a7b 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -4,8 +4,8 @@ from typing import Dict, Iterable, List, Optional import json import logging -import re import os +import re import sys # Third Party @@ -40,11 +40,11 @@ def _num_chars_from_tokens(num_tokens) -> int: def resolve_ocr_options( - docling_model_path: Optional[Path] = None, + docling_model_path: Optional[Path] = None, ) -> OcrOptions: # Declare ocr_options explicitly as Optional[OcrOptions] ocr_options: Optional[OcrOptions] = None - + # First, attempt to use tesserocr try: ocr_options = TesseractOcrOptions() @@ -52,15 +52,17 @@ def resolve_ocr_options( # Third Party from docling.models.tesseract_ocr_model import TesseractOcrModel - _ = TesseractOcrModel(enabled=True, - artifacts_path=docling_model_path, - options=ocr_options, - accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CPU)) + _ = TesseractOcrModel( + enabled=True, + artifacts_path=docling_model_path, + options=ocr_options, + accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CPU), + ) return ocr_options except ImportError: # No tesserocr, so try something else logger.warning("Tesseract not found, falling back to EasyOCR.") - + try: ocr_options = EasyOcrOptions( lang=["en"], diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index d10fcd81..ef178a8e 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -11,7 +11,6 @@ # Third Party from datasets import Dataset - from instructlab.schema.taxonomy import DEFAULT_TAXONOMY_FOLDERS as TAXONOMY_FOLDERS from instructlab.schema.taxonomy import ( TaxonomyMessageFormat, @@ -142,7 +141,7 @@ def _get_documents( repo_url = source.get("repo") commit_hash = source.get("commit") file_patterns = source.get("patterns", []) - + # pylint: disable=too-many-nested-blocks try: repo = git.Repo.clone_from(repo_url, document_output_dir) @@ -174,6 +173,7 @@ def _get_documents( ) filepaths.append(Path(file_path)) logger.info(f"Collected filepath: {file_path}") + # pylint: disable=broad-exception-caught except Exception as file_error: logger.error( f"Error processing file '{file_path}': {file_error}" diff --git a/tests/functional/test_chunkers.py 
b/tests/functional/test_chunkers.py index ca2c400e..32eeb13e 100644 --- a/tests/functional/test_chunkers.py +++ b/tests/functional/test_chunkers.py @@ -56,8 +56,8 @@ def test_chunk_documents( assert len(chunks) > expected_chunks if contains_text: # Normalize spaces and remove newlines for more flexible text comparison - normalized_chunk = ' '.join(chunks[0].replace('\n', ' ').split()) - normalized_text = ' '.join(contains_text.split()) + normalized_chunk = " ".join(chunks[0].replace("\n", " ").split()) + normalized_text = " ".join(contains_text.split()) assert normalized_text in normalized_chunk for chunk in chunks: assert len(chunk) < 2500 From 30a3310933baddad2f1c69a8fdcb1c4ecd3ded28 Mon Sep 17 00:00:00 2001 From: eshwarprasadS Date: Tue, 18 Mar 2025 21:12:11 +0000 Subject: [PATCH 3/7] fix: make return type Optional for resolve_ocr_options Signed-off-by: eshwarprasadS --- src/instructlab/sdg/utils/chunkers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 779c2a7b..d365c082 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -41,7 +41,7 @@ def _num_chars_from_tokens(num_tokens) -> int: def resolve_ocr_options( docling_model_path: Optional[Path] = None, -) -> OcrOptions: +) -> Optional[OcrOptions]: # Declare ocr_options explicitly as Optional[OcrOptions] ocr_options: Optional[OcrOptions] = None From 246e71c6bfa909fb7610af178083735980d87235 Mon Sep 17 00:00:00 2001 From: eshwarprasadS Date: Wed, 19 Mar 2025 19:09:59 +0000 Subject: [PATCH 4/7] test: Add unit tests for _get_documents() method behaviors Signed-off-by: eshwarprasadS --- tests/test_chunkers.py | 100 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 99 insertions(+), 1 deletion(-) diff --git a/tests/test_chunkers.py b/tests/test_chunkers.py index 42db1ac3..dbca08a4 100644 --- a/tests/test_chunkers.py +++ b/tests/test_chunkers.py @@ -2,16 +2,18 @@ # Standard from pathlib import Path -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock, Mock, patch import os import tempfile # Third Party from docling.datamodel.pipeline_options import EasyOcrOptions, TesseractOcrOptions +import git import pytest # First Party from instructlab.sdg.utils.chunkers import DocumentChunker, resolve_ocr_options +from instructlab.sdg.utils.taxonomy import _get_documents # Local from .testdata import testdata @@ -120,3 +122,99 @@ def test_invalid_tokenizer(model_name): model_path = os.path.join(TEST_DATA_DIR, model_name) with pytest.raises(ValueError): DocumentChunker.create_tokenizer(model_path) + + +def test_get_documents_basic(): + """Test successful document retrieval with basic inputs""" + with tempfile.TemporaryDirectory() as temp_dir: + source = { + "repo": "https://fake-repo-url.git", + "commit": "abc123", + "patterns": ["*.md", "*.pdf"], + } + + mock_repo = Mock() + mock_repo.working_dir = temp_dir + + # Create test files + test_md = Path(temp_dir) / "test.md" + test_md.write_text("# Test content") + + with patch("git.Repo.clone_from", return_value=mock_repo): + result = _get_documents(source, document_output_dir=Path(temp_dir)) + + assert len(result) == 1 + assert result[0].name == "test.md" + + +def test_get_documents_html_warning(): + """Test warning is logged when markdown contains HTML""" + with tempfile.TemporaryDirectory() as temp_dir: + source = {"repo": "https://fake-repo-url.git", "patterns": ["*.md"]} + + mock_repo = Mock() + mock_repo.working_dir = 
temp_dir + + # Create test file with HTML + test_md = Path(temp_dir) / "test.md" + test_md.write_text("# Test\n
<div>Some HTML</div>
") + + with ( + patch("git.Repo.clone_from", return_value=mock_repo), + patch("logging.Logger.warning") as mock_warning, + ): + result = _get_documents(source, document_output_dir=Path(temp_dir)) + + mock_warning.assert_called_once() + assert len(result) == 1 + + +def test_get_documents_no_files(): + """Test error when no valid documents are found""" + with tempfile.TemporaryDirectory() as temp_dir: + source = {"repo": "https://fake-repo-url.git", "patterns": ["*.md"]} + + mock_repo = Mock() + mock_repo.working_dir = temp_dir + + with ( + patch("git.Repo.clone_from", return_value=mock_repo), + pytest.raises(SystemExit), + ): + _get_documents(source, document_output_dir=Path(temp_dir)) + + +def test_get_documents_git_error(): + """Test handling of git errors""" + source = {"repo": "https://fake-repo-url.git", "patterns": ["*.md"]} + + with patch("git.Repo.clone_from") as mock_clone: + mock_clone.side_effect = git.exc.GitCommandError("clone", "error") + with pytest.raises(git.exc.GitCommandError): + _get_documents(source) + + +def test_get_documents_skip_checkout(): + """Test that commit checkout is skipped when specified""" + with tempfile.TemporaryDirectory() as temp_dir: + source = { + "repo": "https://fake-repo-url.git", + "commit": "abc123", + "patterns": ["*.md"], + } + + mock_repo = Mock() + mock_repo.working_dir = temp_dir + + # Create a test file so the function finds something + test_md = Path(temp_dir) / "test.md" + test_md.write_text("# Test content") + + with patch("git.Repo.clone_from", return_value=mock_repo) as mock_clone: + result = _get_documents( + source, skip_checkout=True, document_output_dir=Path(temp_dir) + ) + + mock_repo.git.checkout.assert_not_called() + assert len(result) == 1 + assert result[0].name == "test.md" From 8f9910ef8184e6d3fa7781cf6fda805636a381ce Mon Sep 17 00:00:00 2001 From: eshwarprasadS Date: Thu, 20 Mar 2025 18:14:26 +0000 Subject: [PATCH 5/7] chore: return type, comments Signed-off-by: eshwarprasadS --- src/instructlab/sdg/utils/taxonomy.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index ef178a8e..cca706d6 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -3,7 +3,7 @@ # Standard from pathlib import Path from tempfile import mkdtemp -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Union import glob import logging import os @@ -120,7 +120,7 @@ def _get_documents( source: Dict[str, Union[str, List[str]]], skip_checkout: bool = False, document_output_dir: Path = None, -) -> Tuple[List[Path]]: +) -> List[Path]: """ Retrieve the file paths of files (Markdown and PDF) from a Git repository. 
@@ -428,6 +428,7 @@ def leaf_node_to_samples( docling_model_path=None, ): samples = [] + # check if the leaf node has document filepaths, if so, it's a knowledge leaf node if leaf_node and (leaf_node[0].get("filepaths")): samples = _knowledge_leaf_node_to_samples( leaf_node, From 0cafab8ee3648825a661839bb1e09f2e860a4496 Mon Sep 17 00:00:00 2001 From: Khaled Sulayman Date: Fri, 21 Mar 2025 14:12:04 -0400 Subject: [PATCH 6/7] Update setuptools dependency Signed-off-by: Khaled Sulayman --- .github/workflows/e2e-nvidia-t4-x1.yml | 3 ++- constraints.txt | 1 + pyproject.toml | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) create mode 100644 constraints.txt diff --git a/.github/workflows/e2e-nvidia-t4-x1.yml b/.github/workflows/e2e-nvidia-t4-x1.yml index f7dfaa7f..5ef9dd88 100644 --- a/.github/workflows/e2e-nvidia-t4-x1.yml +++ b/.github/workflows/e2e-nvidia-t4-x1.yml @@ -19,6 +19,7 @@ on: - "pyproject.toml" - "requirements**.txt" - ".github/workflows/e2e-nvidia-t4-x1.yml" # This workflow + - "constraints.txt" concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -122,7 +123,7 @@ jobs: # https://github.com/instructlab/instructlab/issues/1821 # install with Torch and build dependencies installed python3.11 -m pip install packaging wheel setuptools-scm - python3.11 -m pip install .[cuda] + DS_ENABLE_NINJA=0 python3.11 -m pip install -c constraints.txt .[cuda] - name: Update instructlab-sdg library working-directory: ./sdg diff --git a/constraints.txt b/constraints.txt new file mode 100644 index 00000000..7d60874a --- /dev/null +++ b/constraints.txt @@ -0,0 +1 @@ +deepspeed==0.15.5 diff --git a/pyproject.toml b/pyproject.toml index fdb3fce7..cd97ca9c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 [build-system] -requires = ["setuptools>=64", "setuptools_scm>=8"] +requires = ["setuptools>=70.1.0", "setuptools_scm>=8"] build-backend = "setuptools.build_meta" [project] From 9074bc323aaa7e6643cdff55f9d7bd4ef01aa139 Mon Sep 17 00:00:00 2001 From: eshwarprasadS Date: Wed, 26 Mar 2025 17:57:26 +0000 Subject: [PATCH 7/7] fix: delete constriants, upgrade setuptools ver, CI fix Signed-off-by: eshwarprasadS --- .github/workflows/e2e-nvidia-t4-x1.yml | 3 +-- constraints.txt | 1 - pyproject.toml | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) delete mode 100644 constraints.txt diff --git a/.github/workflows/e2e-nvidia-t4-x1.yml b/.github/workflows/e2e-nvidia-t4-x1.yml index 5ef9dd88..f7dfaa7f 100644 --- a/.github/workflows/e2e-nvidia-t4-x1.yml +++ b/.github/workflows/e2e-nvidia-t4-x1.yml @@ -19,7 +19,6 @@ on: - "pyproject.toml" - "requirements**.txt" - ".github/workflows/e2e-nvidia-t4-x1.yml" # This workflow - - "constraints.txt" concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -123,7 +122,7 @@ jobs: # https://github.com/instructlab/instructlab/issues/1821 # install with Torch and build dependencies installed python3.11 -m pip install packaging wheel setuptools-scm - DS_ENABLE_NINJA=0 python3.11 -m pip install -c constraints.txt .[cuda] + python3.11 -m pip install .[cuda] - name: Update instructlab-sdg library working-directory: ./sdg diff --git a/constraints.txt b/constraints.txt deleted file mode 100644 index 7d60874a..00000000 --- a/constraints.txt +++ /dev/null @@ -1 +0,0 @@ -deepspeed==0.15.5 diff --git a/pyproject.toml b/pyproject.toml index cd97ca9c..aba9c07d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ # 
SPDX-License-Identifier: Apache-2.0 [build-system] -requires = ["setuptools>=70.1.0", "setuptools_scm>=8"] +requires = ["setuptools>=78.1.0", "setuptools_scm>=8"] build-backend = "setuptools.build_meta" [project]
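
Reviewer note (illustrative, not part of the series): the PATCH 1/7 hunks wire the
new OCR fallback into docling's PDF pipeline roughly as sketched below. The wrapper
name build_pdf_converter is hypothetical; the PdfPipelineOptions, AcceleratorOptions,
and resolve_ocr_options calls are taken from the diff itself, while the
DocumentConverter / PdfFormatOption wiring assumes the docling>=2.18.0 API pinned
in requirements.txt and mirrors the existing _init_docling_converter.

import os
import sys
from pathlib import Path
from typing import Optional

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

from instructlab.sdg.utils.chunkers import resolve_ocr_options


def build_pdf_converter(
    docling_model_path: Optional[Path] = None,
) -> DocumentConverter:
    # OCR stays off unless one of the OCR backends can actually be loaded
    pipeline_options = PdfPipelineOptions(do_ocr=False)

    # Deactivate MPS acceleration on GitHub CI macOS runners, mirroring the
    # guard added in _init_docling_converter (tox now passes CI through)
    if os.getenv("CI") and sys.platform == "darwin":
        pipeline_options.accelerator_options = AcceleratorOptions(
            device=AcceleratorDevice.CPU
        )

    # Tesseract is probed first; on ImportError the helper logs a warning and
    # falls back to EasyOCR, and returns None when neither backend imports
    ocr_options = resolve_ocr_options(docling_model_path=docling_model_path)
    if ocr_options is not None:
        pipeline_options.do_ocr = True
        pipeline_options.ocr_options = ocr_options

    return DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

With this shape, PDF conversion degrades gracefully: tesserocr-backed OCR when it
is importable, EasyOCR pinned to CPU as the fallback, and text-layer-only parsing
when resolve_ocr_options() returns None.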