@@ -46,16 +46,16 @@ def pythia_tokenizer():


 @pytest.fixture(scope="module")
-def gemma_tokenizer():
+def olmo_tokenizer():
     """
-    Load gemma-2-2b-it tokenizer for testing.
+    Load OLMo-3-7B-Instruct tokenizer for testing.

     Properties:
-    - BOS token: '<bos>' (ID: 2)
+    - BOS token: '<|endoftext|>' (ID: 100257)
     - DOES add BOS by default (add_bos_token=True in tokenizer)
     - Used to test tokenizers that add BOS by default
     """
-    tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it")
+    tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-3-7B-Instruct")
     # Set pad token to avoid padding errors in batch encoding
     tokenizer.pad_token = tokenizer.eos_token
     tokenizer.padding_side = "right"
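
The fixture's docstring pins down two properties of the replacement tokenizer: the BOS token text/ID and BOS-by-default behavior. A minimal sketch for sanity-checking them locally, assuming network access, the `transformers` package, and that the docstring's claims actually hold for `allenai/OLMo-3-7B-Instruct`:

```python
from transformers import AutoTokenizer

# Sanity-check the properties the fixture's docstring claims (assumption:
# the model repo is reachable and its tokenizer adds BOS by default).
tok = AutoTokenizer.from_pretrained("allenai/OLMo-3-7B-Instruct")
ids = tok.encode("Hello")  # add_special_tokens=True is the encode() default

print(tok.bos_token, tok.bos_token_id)  # per the docstring: '<|endoftext|>', 100257
assert ids[0] == tok.bos_token_id  # holds only if BOS is added by default
```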
@@ -150,14 +150,14 @@ def test_both_none_returns_empty(self):
 class TestDefaultsToNone:
     """Test that add_bos_token defaults to None, allowing tokenizer defaults."""

-    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "gemma_tokenizer"])
+    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "olmo_tokenizer"])
     def test_huggingface_none_uses_tokenizer_default(self, tokenizer_name, request):
         """
         HF: When add_bos_token=None, should respect tokenizer's default.

         Tests both tokenizer types:
         - Pythia: Doesn't add BOS by default
-        - Gemma: DOES add BOS by default
+        - OLMo: DOES add BOS by default
         """
         tokenizer = request.getfixturevalue(tokenizer_name)
         mock_hflm = create_hf_mock(tokenizer, add_bos_token=None)
@@ -166,14 +166,14 @@ def test_huggingface_none_uses_tokenizer_default(self, tokenizer_name, request):
         expected = tokenizer.encode("Hello")
         assert result == expected

-    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "gemma_tokenizer"])
+    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "olmo_tokenizer"])
     def test_vllm_none_uses_tokenizer_default(self, tokenizer_name, request):
         """
         vLLM: When add_bos_token=None, should respect tokenizer's default.

         Tests both tokenizer types:
         - Pythia: Doesn't add BOS by default
-        - Gemma: DOES add BOS by default
+        - OLMo: DOES add BOS by default
         """
         tokenizer = request.getfixturevalue(tokenizer_name)
         mock_vllm = create_vllm_mock(tokenizer, add_bos_token=None)
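
The contract these two tests pin down is small: `add_bos_token=None` means "defer to the tokenizer", an explicit value overrides it. A hypothetical sketch of that dispatch (not the harness's actual implementation, whose real code paths sit behind `create_hf_mock`/`create_vllm_mock`):

```python
# Hypothetical sketch of the None-defers-to-tokenizer contract under test.
def encode_with_bos_policy(tokenizer, text: str, add_bos_token=None) -> list[int]:
    if add_bos_token is None:
        # Defer to the tokenizer default: Pythia adds no BOS, OLMo does.
        return tokenizer.encode(text)
    if add_bos_token:
        # Explicitly requested: force a BOS regardless of the default.
        ids = tokenizer.encode(text, add_special_tokens=False)
        return [tokenizer.bos_token_id] + ids
    # Explicitly disabled: no special tokens at all.
    return tokenizer.encode(text, add_special_tokens=False)
```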
@@ -191,7 +191,7 @@ def test_vllm_none_uses_tokenizer_default(self, tokenizer_name, request):
 class TestNoDuplicateBos:
     """Test that BOS tokens are never duplicated when already present."""

-    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "gemma_tokenizer"])
+    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "olmo_tokenizer"])
     def test_huggingface_detects_bos_in_single_string(self, tokenizer_name, request):
         """HF: Should detect BOS prefix and avoid duplication."""
         tokenizer = request.getfixturevalue(tokenizer_name)
@@ -215,7 +215,7 @@ def test_huggingface_detects_bos_in_single_string(self, tokenizer_name, request):
         # Should avoid duplication (fewer or equal tokens)
         assert input_ids.shape[1] <= without_detection.shape[1]

-    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "gemma_tokenizer"])
+    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "olmo_tokenizer"])
     def test_huggingface_adds_bos_when_missing(self, tokenizer_name, request):
         """HF: Should add BOS when string doesn't have it (using add_special_tokens=True)"""
         tokenizer = request.getfixturevalue(tokenizer_name)
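
The duplication guard these tests exercise boils down to: if the raw string already begins with the BOS token text (as chat-template output typically does), skip special-token insertion. A hypothetical helper illustrating the idea; the code path actually tested is the real implementation inside the mocked models:

```python
# Hypothetical illustration of the duplication guard under test: strings that
# already carry a textual BOS prefix are encoded without special tokens.
def encode_no_duplicate_bos(tokenizer, text: str) -> list[int]:
    bos = tokenizer.bos_token
    if bos is not None and text.startswith(bos):
        return tokenizer.encode(text, add_special_tokens=False)
    return tokenizer.encode(text)  # tokenizer default may still add BOS here
```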
@@ -228,13 +228,13 @@ def test_huggingface_adds_bos_when_missing(self, tokenizer_name, request):

         assert input_ids.tolist() == expected.tolist()

-    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "gemma_tokenizer"])
+    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "olmo_tokenizer"])
     def test_huggingface_follows_tokenizer_default(self, tokenizer_name, request):
         """
         HF: When add_bos_token is not set (None), follows tokenizer default.

         - Pythia: Doesn't add BOS by default
-        - Gemma: DOES add BOS by default
+        - OLMo: DOES add BOS by default
         """
         tokenizer = request.getfixturevalue(tokenizer_name)
         mock_hflm = create_hf_mock(tokenizer, add_bos_token=None)
@@ -244,7 +244,7 @@ def test_huggingface_follows_tokenizer_default(self, tokenizer_name, request):

         assert input_ids.tolist() == expected.tolist()

-    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "gemma_tokenizer"])
+    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "olmo_tokenizer"])
     @pytest.mark.parametrize("add_bos_token", [None, True])
     def test_vllm_handles_mixed_batch(self, tokenizer_name, add_bos_token, request):
         """
@@ -284,7 +284,7 @@ def test_vllm_handles_mixed_batch(self, tokenizer_name, add_bos_token, request):
         for i, exp in enumerate(expected):
             assert result[i] == exp

-    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "gemma_tokenizer"])
+    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "olmo_tokenizer"])
     @pytest.mark.parametrize("add_bos_token", [None, True])
     def test_vllm_preserves_order_in_mixed_batch(
         self, tokenizer_name, add_bos_token, request
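
The two vLLM batch tests check the same guard applied element-wise: BOS detection must be per string, and results must come back in input order even when BOS-prefixed and plain strings are interleaved. In terms of the hypothetical helper sketched above:

```python
# Element-wise application of the guard; the result order mirrors the input,
# which is what test_vllm_preserves_order_in_mixed_batch asserts.
def encode_batch(tokenizer, texts: list[str]) -> list[list[int]]:
    return [encode_no_duplicate_bos(tokenizer, t) for t in texts]
```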
@@ -328,7 +328,7 @@ def test_vllm_preserves_order_in_mixed_batch(
 class TestChatTemplateCompatibility:
     """Test that chat templates (which add BOS) work without duplication."""

-    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "gemma_tokenizer"])
+    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "olmo_tokenizer"])
     def test_huggingface_chat_template_no_duplicate_bos(self, tokenizer_name, request):
         """
         HF: Chat template adds BOS, tokenizer should not add another.
@@ -352,7 +352,7 @@ def test_huggingface_chat_template_no_duplicate_bos(self, tokenizer_name, request):

         assert torch.equal(input_ids, expected)

-    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "gemma_tokenizer"])
+    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "olmo_tokenizer"])
     @pytest.mark.parametrize("add_bos_token", [None, True])
     def test_vllm_mixed_chat_batch(self, tokenizer_name, add_bos_token, request):
         """
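
The chat-template tests rely on the standard `transformers` flow: render the template to text (the rendered string usually already contains BOS), then encode without adding special tokens again. A minimal sketch of that flow, assuming `tokenizer` is one of the fixtures above and its chat template emits a BOS token:

```python
# Minimal sketch of the chat-template path covered here (assumes the
# tokenizer ships a chat template that already emits a BOS token).
messages = [{"role": "user", "content": "Hello"}]
rendered = tokenizer.apply_chat_template(messages, tokenize=False)
input_ids = tokenizer.encode(rendered, add_special_tokens=False)  # no second BOS
```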
@@ -424,7 +424,7 @@ def test_huggingface_seq2seq_skips_causal_bos_logic(self, pythia_tokenizer):
 class TestLoglikelihoodBosHandling:
     """Test BOS handling in loglikelihood method."""

-    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "gemma_tokenizer"])
+    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "olmo_tokenizer"])
     @pytest.mark.parametrize("add_bos_token", [None, True])
     def test_empty_context_continuation_with_bos(
         self, tokenizer_name, add_bos_token, request
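
The loglikelihood tests revolve around one convention: when the context is empty, a prefix token (typically BOS, falling back to EOS) stands in for it, so the continuation's first token still has something to condition on. A hedged sketch of that convention; the real request-splitting logic is what the captured requests verify:

```python
# Hedged sketch of the empty-context convention the tests below verify;
# the real splitting logic is exercised via the capture_and_return hook.
def split_for_loglikelihood(tokenizer, context: str, continuation: str):
    continuation_enc = tokenizer.encode(continuation, add_special_tokens=False)
    if context == "":
        # Empty context: substitute a prefix token (BOS if set, else EOS).
        prefix = (
            tokenizer.bos_token_id
            if tokenizer.bos_token_id is not None
            else tokenizer.eos_token_id
        )
        return [prefix], continuation_enc
    return tokenizer.encode(context), continuation_enc
```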
@@ -479,7 +479,7 @@ def capture_and_return(reqs, disable_tqdm=False):
         )
         assert continuation_enc == continuation_without_bos[1:]  # Skip the BOS token

-    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "gemma_tokenizer"])
+    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "olmo_tokenizer"])
     @pytest.mark.parametrize("add_bos_token", [None, True])
     def test_empty_context_continuation_without_bos(
         self, tokenizer_name, add_bos_token, request
@@ -523,7 +523,7 @@ def capture_and_return(reqs, disable_tqdm=False):
         expected_continuation = tokenizer.encode(continuation, add_special_tokens=False)
         assert continuation_enc == expected_continuation

-    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "gemma_tokenizer"])
+    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "olmo_tokenizer"])
     @pytest.mark.parametrize("add_bos_token", [None, True])
     def test_context_with_bos_prefix(self, tokenizer_name, add_bos_token, request):
         """When context starts with BOS (e.g., from chat template), should not duplicate BOS."""