From 84f4c7fecf43e2437f15492239e431c3b9f8437f Mon Sep 17 00:00:00 2001 From: eshwarprasadS Date: Tue, 18 Mar 2025 19:30:41 +0000 Subject: [PATCH 1/7] feat: update docling requirements and improve OCR options handling with bumped ver. Signed-off-by: eshwarprasadS --- requirements.txt | 5 +- src/instructlab/sdg/utils/chunkers.py | 44 +++++++++++--- src/instructlab/sdg/utils/taxonomy.py | 87 ++++----------------------- tests/functional/test_chunkers.py | 5 +- tox.ini | 2 + 5 files changed, 55 insertions(+), 88 deletions(-) diff --git a/requirements.txt b/requirements.txt index ec3c012d..aceefdbc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 click>=8.1.7,<9.0.0 datasets>=2.18.0,<3.0.0 -docling[tesserocr]>=2.4.2,<=2.8.3; sys_platform != 'darwin' -docling>=2.4.2,<=2.8.3; sys_platform == 'darwin' -docling-parse>=2.0.0,<3.0.0 +docling[tesserocr]>=2.18.0; sys_platform != 'darwin' +docling>=2.18.0; sys_platform == 'darwin' GitPython>=3.1.42,<4.0.0 gguf>=0.6.0 httpx>=0.25.0,<1.0.0 diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index e256c969..febee1ba 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -5,12 +5,16 @@ import json import logging import re +import os +import sys # Third Party from datasets import Dataset from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( + AcceleratorDevice, + AcceleratorOptions, EasyOcrOptions, OcrOptions, PdfPipelineOptions, @@ -35,7 +39,12 @@ def _num_chars_from_tokens(num_tokens) -> int: return int(num_tokens * 4) # 1 token ~ 4 English character -def resolve_ocr_options() -> OcrOptions: +def resolve_ocr_options( + docling_model_path: Optional[Path] = None, +) -> OcrOptions: + # Declare ocr_options explicitly as Optional[OcrOptions] + ocr_options: Optional[OcrOptions] = None + # First, attempt to use tesserocr try: ocr_options = TesseractOcrOptions() @@ -43,21 +52,35 @@ def resolve_ocr_options() -> OcrOptions: # Third Party from docling.models.tesseract_ocr_model import TesseractOcrModel - _ = TesseractOcrModel(True, ocr_options) + _ = TesseractOcrModel(enabled=True, + artifacts_path=docling_model_path, + options=ocr_options, + accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CPU)) return ocr_options except ImportError: # No tesserocr, so try something else - pass + logger.warning("Tesseract not found, falling back to EasyOCR.") + try: - ocr_options = EasyOcrOptions() - # Keep easyocr models on the CPU instead of GPU - ocr_options.use_gpu = False + ocr_options = EasyOcrOptions( + lang=["en"], + use_gpu=None, + confidence_threshold=0.5, + model_storage_directory=str(docling_model_path), + recog_network="standard", + download_enabled=True, + ) # triggers torch loading, import lazily # pylint: disable=import-outside-toplevel # Third Party from docling.models.easyocr_model import EasyOcrModel - _ = EasyOcrModel(True, ocr_options) + _ = EasyOcrModel( + enabled=True, + artifacts_path=None, + options=ocr_options, + accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CPU), + ) return ocr_options except ImportError: # no easyocr either, so don't use any OCR @@ -127,7 +150,12 @@ def _init_docling_converter(self): do_ocr=False, ) - ocr_options = resolve_ocr_options() + # deactivate MPS acceleration on Github CI + if os.getenv("CI") and sys.platform == "darwin": + 
pipeline_options.accelerator_options = AcceleratorOptions( + device=AcceleratorDevice.CPU + ) + ocr_options = resolve_ocr_options(docling_model_path=self.docling_model_path) if ocr_options is not None: pipeline_options.do_ocr = True pipeline_options.ocr_options = ocr_options diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index ed0d7940..d10fcd81 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -12,8 +12,6 @@ # Third Party from datasets import Dataset -# pylint: disable=no-name-in-module -from docling_parse.docling_parse import pdf_parser_v1 from instructlab.schema.taxonomy import DEFAULT_TAXONOMY_FOLDERS as TAXONOMY_FOLDERS from instructlab.schema.taxonomy import ( TaxonomyMessageFormat, @@ -27,9 +25,6 @@ # Local from .chunkers import DocumentChunker -# Initialize the pdf parser -PDFParser = pdf_parser_v1() - logger = logging.getLogger(__name__) @@ -126,9 +121,9 @@ def _get_documents( source: Dict[str, Union[str, List[str]]], skip_checkout: bool = False, document_output_dir: Path = None, -) -> Tuple[List[str], List[Path]]: +) -> Tuple[List[Path]]: """ - Retrieve the content of files (Markdown and PDF) from a Git repository. + Retrieve the file paths of files (Markdown and PDF) from a Git repository. Args: source (dict): Source info containing repository URL, commit hash, and list of file patterns. @@ -148,13 +143,12 @@ def _get_documents( commit_hash = source.get("commit") file_patterns = source.get("patterns", []) - try: # pylint: disable=too-many-nested-blocks + try: repo = git.Repo.clone_from(repo_url, document_output_dir) if not skip_checkout and commit_hash: repo.git.checkout(commit_hash) - file_contents = [] filepaths = [] logger.info("Processing files...") @@ -170,7 +164,6 @@ def _get_documents( logger.info(f"Processing file: {file_path}") try: if file_path.lower().endswith(".md"): - # Process Markdown files with open(file_path, "r", encoding="utf-8") as file: content = file.read() if _string_contains_html(content): @@ -179,66 +172,9 @@ def _get_documents( "NOTE: Continuing this might affect your data generation quality." "To get best results please format your markdown documents without the use of HTML or use a different document filetype." 
) - file_contents.append(content) - filepaths.append(Path(file_path)) - logger.info( - f"Appended Markdown content from {file_path}" - ) - - elif file_path.lower().endswith(".pdf"): - # Process PDF files using docling_parse's pdf_parser_v1 - doc_key = f"key_{os.path.basename(file_path)}" # Unique document key - logger.info(f"Loading PDF document from {file_path}") - - success = PDFParser.load_document(doc_key, file_path) - if not success: - logger.warning( - f"Failed to load PDF document: {file_path}" - ) - continue - - num_pages = PDFParser.number_of_pages(doc_key) - logger.info(f"PDF '{file_path}' has {num_pages} pages.") - - pdf_text = "" - - for page in range(num_pages): - try: - json_doc = PDFParser.parse_pdf_from_key_on_page( - doc_key, page - ) - if "pages" not in json_doc or not json_doc["pages"]: - logger.warning( - f"Page {page + 1} could not be parsed in '{file_path}'" - ) - continue - - json_page = json_doc["pages"][0] - - # Extract text from cells - for cell in json_page.get("cells", []): - text = cell.get("content", {}).get( - "rnormalized", "" - ) - if text.strip(): # Only append non-empty text - pdf_text += text.strip() + "\n" - except Exception as page_error: # pylint: disable=broad-exception-caught - logger.warning( - f"Error parsing page {page + 1} of '{file_path}': {page_error}" - ) - continue - - if pdf_text: - file_contents.append(pdf_text) - filepaths.append(Path(file_path)) - - # Unload the document to free memory - PDFParser.unload_document(doc_key) - logger.info(f"Unloaded PDF document: {file_path}") - - else: - logger.info(f"Skipping unsupported file type: {file_path}") - except Exception as file_error: # pylint: disable=broad-exception-caught + filepaths.append(Path(file_path)) + logger.info(f"Collected filepath: {file_path}") + except Exception as file_error: logger.error( f"Error processing file '{file_path}': {file_error}" ) @@ -246,8 +182,8 @@ def _get_documents( else: logger.info(f"Skipping non-file path: {file_path}") - if file_contents: - return file_contents, filepaths + if filepaths: + return filepaths raise SystemExit("Couldn't find knowledge documents") except (OSError, git.exc.GitCommandError, FileNotFoundError) as e: @@ -281,13 +217,13 @@ def _read_taxonomy_file( task_description = contents.get("task_description", None) domain = contents.get("domain") documents = contents.get("document") - document_contents, doc_filepaths = None, None + doc_filepaths = None if documents: os.makedirs(document_output_dir, exist_ok=True) unique_output_dir = mkdtemp( prefix=f"{leaf_node_path}_", dir=document_output_dir ) - document_contents, doc_filepaths = _get_documents( + doc_filepaths = _get_documents( source=documents, document_output_dir=unique_output_dir, ) @@ -302,7 +238,6 @@ def _read_taxonomy_file( "questions_and_answers": question_answer_list, "context": context, "taxonomy_path": tax_path, - "documents": document_contents, "filepaths": doc_filepaths, "domain": domain, "document_outline": contents.get("document_outline"), @@ -493,7 +428,7 @@ def leaf_node_to_samples( docling_model_path=None, ): samples = [] - if leaf_node and leaf_node[0].get("documents"): + if leaf_node and (leaf_node[0].get("filepaths")): samples = _knowledge_leaf_node_to_samples( leaf_node, server_ctx_size, diff --git a/tests/functional/test_chunkers.py b/tests/functional/test_chunkers.py index f06db504..ca2c400e 100644 --- a/tests/functional/test_chunkers.py +++ b/tests/functional/test_chunkers.py @@ -55,6 +55,9 @@ def test_chunk_documents( chunks = chunker.chunk_documents() assert 
len(chunks) > expected_chunks if contains_text: - assert contains_text in chunks[0] + # Normalize spaces and remove newlines for more flexible text comparison + normalized_chunk = ' '.join(chunks[0].replace('\n', ' ').split()) + normalized_text = ' '.join(contains_text.split()) + assert normalized_text in normalized_chunk for chunk in chunks: assert len(chunk) < 2500 diff --git a/tox.ini b/tox.ini index cf912fd8..765c057f 100644 --- a/tox.ini +++ b/tox.ini @@ -12,6 +12,8 @@ description = run tests (unit, unitcov, functional) # are huge. This reduces venv from 5.7 GB to 1.5 GB. setenv = PIP_EXTRA_INDEX_URL=https://download.pytorch.org/whl/cpu +pass_env = + CI package = wheel wheel_build_env = pkg deps = -r requirements-dev.txt From 7f18d2888c7c94eab982b7cc41b3a4bb7cbef47d Mon Sep 17 00:00:00 2001 From: eshwarprasadS Date: Tue, 18 Mar 2025 21:06:27 +0000 Subject: [PATCH 2/7] chore: linting.. Signed-off-by: eshwarprasadS --- src/instructlab/sdg/utils/chunkers.py | 18 ++++++++++-------- src/instructlab/sdg/utils/taxonomy.py | 4 ++-- tests/functional/test_chunkers.py | 4 ++-- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index febee1ba..779c2a7b 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -4,8 +4,8 @@ from typing import Dict, Iterable, List, Optional import json import logging -import re import os +import re import sys # Third Party @@ -40,11 +40,11 @@ def _num_chars_from_tokens(num_tokens) -> int: def resolve_ocr_options( - docling_model_path: Optional[Path] = None, + docling_model_path: Optional[Path] = None, ) -> OcrOptions: # Declare ocr_options explicitly as Optional[OcrOptions] ocr_options: Optional[OcrOptions] = None - + # First, attempt to use tesserocr try: ocr_options = TesseractOcrOptions() @@ -52,15 +52,17 @@ def resolve_ocr_options( # Third Party from docling.models.tesseract_ocr_model import TesseractOcrModel - _ = TesseractOcrModel(enabled=True, - artifacts_path=docling_model_path, - options=ocr_options, - accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CPU)) + _ = TesseractOcrModel( + enabled=True, + artifacts_path=docling_model_path, + options=ocr_options, + accelerator_options=AcceleratorOptions(device=AcceleratorDevice.CPU), + ) return ocr_options except ImportError: # No tesserocr, so try something else logger.warning("Tesseract not found, falling back to EasyOCR.") - + try: ocr_options = EasyOcrOptions( lang=["en"], diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index d10fcd81..ef178a8e 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -11,7 +11,6 @@ # Third Party from datasets import Dataset - from instructlab.schema.taxonomy import DEFAULT_TAXONOMY_FOLDERS as TAXONOMY_FOLDERS from instructlab.schema.taxonomy import ( TaxonomyMessageFormat, @@ -142,7 +141,7 @@ def _get_documents( repo_url = source.get("repo") commit_hash = source.get("commit") file_patterns = source.get("patterns", []) - + # pylint: disable=too-many-nested-blocks try: repo = git.Repo.clone_from(repo_url, document_output_dir) @@ -174,6 +173,7 @@ def _get_documents( ) filepaths.append(Path(file_path)) logger.info(f"Collected filepath: {file_path}") + # pylint: disable=broad-exception-caught except Exception as file_error: logger.error( f"Error processing file '{file_path}': {file_error}" diff --git a/tests/functional/test_chunkers.py 
b/tests/functional/test_chunkers.py index ca2c400e..32eeb13e 100644 --- a/tests/functional/test_chunkers.py +++ b/tests/functional/test_chunkers.py @@ -56,8 +56,8 @@ def test_chunk_documents( assert len(chunks) > expected_chunks if contains_text: # Normalize spaces and remove newlines for more flexible text comparison - normalized_chunk = ' '.join(chunks[0].replace('\n', ' ').split()) - normalized_text = ' '.join(contains_text.split()) + normalized_chunk = " ".join(chunks[0].replace("\n", " ").split()) + normalized_text = " ".join(contains_text.split()) assert normalized_text in normalized_chunk for chunk in chunks: assert len(chunk) < 2500 From 30a3310933baddad2f1c69a8fdcb1c4ecd3ded28 Mon Sep 17 00:00:00 2001 From: eshwarprasadS Date: Tue, 18 Mar 2025 21:12:11 +0000 Subject: [PATCH 3/7] fix: make return type Optional for resolve_ocr_options Signed-off-by: eshwarprasadS --- src/instructlab/sdg/utils/chunkers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/instructlab/sdg/utils/chunkers.py b/src/instructlab/sdg/utils/chunkers.py index 779c2a7b..d365c082 100644 --- a/src/instructlab/sdg/utils/chunkers.py +++ b/src/instructlab/sdg/utils/chunkers.py @@ -41,7 +41,7 @@ def _num_chars_from_tokens(num_tokens) -> int: def resolve_ocr_options( docling_model_path: Optional[Path] = None, -) -> OcrOptions: +) -> Optional[OcrOptions]: # Declare ocr_options explicitly as Optional[OcrOptions] ocr_options: Optional[OcrOptions] = None From 246e71c6bfa909fb7610af178083735980d87235 Mon Sep 17 00:00:00 2001 From: eshwarprasadS Date: Wed, 19 Mar 2025 19:09:59 +0000 Subject: [PATCH 4/7] test: Add unit tests for _get_documents() method behaviors Signed-off-by: eshwarprasadS --- tests/test_chunkers.py | 100 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 99 insertions(+), 1 deletion(-) diff --git a/tests/test_chunkers.py b/tests/test_chunkers.py index 42db1ac3..dbca08a4 100644 --- a/tests/test_chunkers.py +++ b/tests/test_chunkers.py @@ -2,16 +2,18 @@ # Standard from pathlib import Path -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock, Mock, patch import os import tempfile # Third Party from docling.datamodel.pipeline_options import EasyOcrOptions, TesseractOcrOptions +import git import pytest # First Party from instructlab.sdg.utils.chunkers import DocumentChunker, resolve_ocr_options +from instructlab.sdg.utils.taxonomy import _get_documents # Local from .testdata import testdata @@ -120,3 +122,99 @@ def test_invalid_tokenizer(model_name): model_path = os.path.join(TEST_DATA_DIR, model_name) with pytest.raises(ValueError): DocumentChunker.create_tokenizer(model_path) + + +def test_get_documents_basic(): + """Test successful document retrieval with basic inputs""" + with tempfile.TemporaryDirectory() as temp_dir: + source = { + "repo": "https://fake-repo-url.git", + "commit": "abc123", + "patterns": ["*.md", "*.pdf"], + } + + mock_repo = Mock() + mock_repo.working_dir = temp_dir + + # Create test files + test_md = Path(temp_dir) / "test.md" + test_md.write_text("# Test content") + + with patch("git.Repo.clone_from", return_value=mock_repo): + result = _get_documents(source, document_output_dir=Path(temp_dir)) + + assert len(result) == 1 + assert result[0].name == "test.md" + + +def test_get_documents_html_warning(): + """Test warning is logged when markdown contains HTML""" + with tempfile.TemporaryDirectory() as temp_dir: + source = {"repo": "https://fake-repo-url.git", "patterns": ["*.md"]} + + mock_repo = Mock() + mock_repo.working_dir = 
temp_dir + + # Create test file with HTML + test_md = Path(temp_dir) / "test.md" + test_md.write_text("# Test\n
<div>Some HTML</div>
") + + with ( + patch("git.Repo.clone_from", return_value=mock_repo), + patch("logging.Logger.warning") as mock_warning, + ): + result = _get_documents(source, document_output_dir=Path(temp_dir)) + + mock_warning.assert_called_once() + assert len(result) == 1 + + +def test_get_documents_no_files(): + """Test error when no valid documents are found""" + with tempfile.TemporaryDirectory() as temp_dir: + source = {"repo": "https://fake-repo-url.git", "patterns": ["*.md"]} + + mock_repo = Mock() + mock_repo.working_dir = temp_dir + + with ( + patch("git.Repo.clone_from", return_value=mock_repo), + pytest.raises(SystemExit), + ): + _get_documents(source, document_output_dir=Path(temp_dir)) + + +def test_get_documents_git_error(): + """Test handling of git errors""" + source = {"repo": "https://fake-repo-url.git", "patterns": ["*.md"]} + + with patch("git.Repo.clone_from") as mock_clone: + mock_clone.side_effect = git.exc.GitCommandError("clone", "error") + with pytest.raises(git.exc.GitCommandError): + _get_documents(source) + + +def test_get_documents_skip_checkout(): + """Test that commit checkout is skipped when specified""" + with tempfile.TemporaryDirectory() as temp_dir: + source = { + "repo": "https://fake-repo-url.git", + "commit": "abc123", + "patterns": ["*.md"], + } + + mock_repo = Mock() + mock_repo.working_dir = temp_dir + + # Create a test file so the function finds something + test_md = Path(temp_dir) / "test.md" + test_md.write_text("# Test content") + + with patch("git.Repo.clone_from", return_value=mock_repo) as mock_clone: + result = _get_documents( + source, skip_checkout=True, document_output_dir=Path(temp_dir) + ) + + mock_repo.git.checkout.assert_not_called() + assert len(result) == 1 + assert result[0].name == "test.md" From 8f9910ef8184e6d3fa7781cf6fda805636a381ce Mon Sep 17 00:00:00 2001 From: eshwarprasadS Date: Thu, 20 Mar 2025 18:14:26 +0000 Subject: [PATCH 5/7] chore: return type, comments Signed-off-by: eshwarprasadS --- src/instructlab/sdg/utils/taxonomy.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/instructlab/sdg/utils/taxonomy.py b/src/instructlab/sdg/utils/taxonomy.py index ef178a8e..cca706d6 100644 --- a/src/instructlab/sdg/utils/taxonomy.py +++ b/src/instructlab/sdg/utils/taxonomy.py @@ -3,7 +3,7 @@ # Standard from pathlib import Path from tempfile import mkdtemp -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Union import glob import logging import os @@ -120,7 +120,7 @@ def _get_documents( source: Dict[str, Union[str, List[str]]], skip_checkout: bool = False, document_output_dir: Path = None, -) -> Tuple[List[Path]]: +) -> List[Path]: """ Retrieve the file paths of files (Markdown and PDF) from a Git repository. 
@@ -428,6 +428,7 @@ def leaf_node_to_samples( docling_model_path=None, ): samples = [] + # check if the leaf node has document filepaths, if so, it's a knowledge leaf node if leaf_node and (leaf_node[0].get("filepaths")): samples = _knowledge_leaf_node_to_samples( leaf_node, From 0cafab8ee3648825a661839bb1e09f2e860a4496 Mon Sep 17 00:00:00 2001 From: Khaled Sulayman Date: Fri, 21 Mar 2025 14:12:04 -0400 Subject: [PATCH 6/7] Update setuptools dependency Signed-off-by: Khaled Sulayman --- .github/workflows/e2e-nvidia-t4-x1.yml | 3 ++- constraints.txt | 1 + pyproject.toml | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) create mode 100644 constraints.txt diff --git a/.github/workflows/e2e-nvidia-t4-x1.yml b/.github/workflows/e2e-nvidia-t4-x1.yml index f7dfaa7f..5ef9dd88 100644 --- a/.github/workflows/e2e-nvidia-t4-x1.yml +++ b/.github/workflows/e2e-nvidia-t4-x1.yml @@ -19,6 +19,7 @@ on: - "pyproject.toml" - "requirements**.txt" - ".github/workflows/e2e-nvidia-t4-x1.yml" # This workflow + - "constraints.txt" concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -122,7 +123,7 @@ jobs: # https://github.com/instructlab/instructlab/issues/1821 # install with Torch and build dependencies installed python3.11 -m pip install packaging wheel setuptools-scm - python3.11 -m pip install .[cuda] + DS_ENABLE_NINJA=0 python3.11 -m pip install -c constraints.txt .[cuda] - name: Update instructlab-sdg library working-directory: ./sdg diff --git a/constraints.txt b/constraints.txt new file mode 100644 index 00000000..7d60874a --- /dev/null +++ b/constraints.txt @@ -0,0 +1 @@ +deepspeed==0.15.5 diff --git a/pyproject.toml b/pyproject.toml index fdb3fce7..cd97ca9c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 [build-system] -requires = ["setuptools>=64", "setuptools_scm>=8"] +requires = ["setuptools>=70.1.0", "setuptools_scm>=8"] build-backend = "setuptools.build_meta" [project] From 9074bc323aaa7e6643cdff55f9d7bd4ef01aa139 Mon Sep 17 00:00:00 2001 From: eshwarprasadS Date: Wed, 26 Mar 2025 17:57:26 +0000 Subject: [PATCH 7/7] fix: delete constriants, upgrade setuptools ver, CI fix Signed-off-by: eshwarprasadS --- .github/workflows/e2e-nvidia-t4-x1.yml | 3 +-- constraints.txt | 1 - pyproject.toml | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) delete mode 100644 constraints.txt diff --git a/.github/workflows/e2e-nvidia-t4-x1.yml b/.github/workflows/e2e-nvidia-t4-x1.yml index 5ef9dd88..f7dfaa7f 100644 --- a/.github/workflows/e2e-nvidia-t4-x1.yml +++ b/.github/workflows/e2e-nvidia-t4-x1.yml @@ -19,7 +19,6 @@ on: - "pyproject.toml" - "requirements**.txt" - ".github/workflows/e2e-nvidia-t4-x1.yml" # This workflow - - "constraints.txt" concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -123,7 +122,7 @@ jobs: # https://github.com/instructlab/instructlab/issues/1821 # install with Torch and build dependencies installed python3.11 -m pip install packaging wheel setuptools-scm - DS_ENABLE_NINJA=0 python3.11 -m pip install -c constraints.txt .[cuda] + python3.11 -m pip install .[cuda] - name: Update instructlab-sdg library working-directory: ./sdg diff --git a/constraints.txt b/constraints.txt deleted file mode 100644 index 7d60874a..00000000 --- a/constraints.txt +++ /dev/null @@ -1 +0,0 @@ -deepspeed==0.15.5 diff --git a/pyproject.toml b/pyproject.toml index cd97ca9c..aba9c07d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ # 
SPDX-License-Identifier: Apache-2.0 [build-system] -requires = ["setuptools>=70.1.0", "setuptools_scm>=8"] +requires = ["setuptools>=78.1.0", "setuptools_scm>=8"] build-backend = "setuptools.build_meta" [project]
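
Reviewer note (illustrative, not part of the series): the PATCH 1/7 hunks wire the
new OCR fallback into docling's PDF pipeline roughly as sketched below. The wrapper
name build_pdf_converter is hypothetical; the PdfPipelineOptions, AcceleratorOptions,
and resolve_ocr_options calls are taken from the diff itself, while the
DocumentConverter / PdfFormatOption wiring assumes the docling>=2.18.0 API pinned
in requirements.txt and mirrors the existing _init_docling_converter.

import os
import sys
from pathlib import Path
from typing import Optional

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

from instructlab.sdg.utils.chunkers import resolve_ocr_options


def build_pdf_converter(
    docling_model_path: Optional[Path] = None,
) -> DocumentConverter:
    # OCR stays off unless one of the OCR backends can actually be loaded
    pipeline_options = PdfPipelineOptions(do_ocr=False)

    # Deactivate MPS acceleration on GitHub CI macOS runners, mirroring the
    # guard added in _init_docling_converter (tox now passes CI through)
    if os.getenv("CI") and sys.platform == "darwin":
        pipeline_options.accelerator_options = AcceleratorOptions(
            device=AcceleratorDevice.CPU
        )

    # Tesseract is probed first; on ImportError the helper logs a warning and
    # falls back to EasyOCR, and returns None when neither backend imports
    ocr_options = resolve_ocr_options(docling_model_path=docling_model_path)
    if ocr_options is not None:
        pipeline_options.do_ocr = True
        pipeline_options.ocr_options = ocr_options

    return DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )

With this shape, PDF conversion degrades gracefully: tesserocr-backed OCR when it
is importable, EasyOCR pinned to CPU as the fallback, and text-layer-only parsing
when resolve_ocr_options() returns None.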