Commit f9ca748

[CI] Speed up slow tests in tests-gpu/tests-cpu (#3395)

1 parent 519b92a

4 files changed: +94 −24 lines

.github/unittest/linux/scripts/run_all.sh (54 additions, 6 deletions)

@@ -115,6 +115,7 @@ uv_pip_install \
     pytest-forked \
     pytest-asyncio \
     pytest-isolate \
+    pytest-xdist \
     expecttest \
     "pybind11[global]>=2.13" \
     pyyaml \
@@ -285,18 +286,65 @@ run_distributed_tests() {
         echo "TORCHRL_TEST_SUITE=${TORCHRL_TEST_SUITE}: distributed tests require GPU (CU_VERSION != cpu)."
         return 1
     fi
-    python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_distributed.py \
+    # Run both test_distributed.py and test_rb_distributed.py (both use torch.distributed)
+    python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_distributed.py test/test_rb_distributed.py \
         --instafail --durations 200 -vv --capture no \
         --timeout=120 --mp_fork_if_no_cuda
 }

 run_non_distributed_tests() {
     # Note: we always ignore distributed tests here (they can be run in a separate job).
-    python .github/unittest/helpers/coverage_run_parallel.py -m pytest test \
-        --instafail --durations 200 -vv --capture no --ignore test/test_rlhf.py \
-        --ignore test/test_distributed.py \
-        --ignore test/llm \
-        --timeout=120 --mp_fork_if_no_cuda
+    # Also ignore test_setup.py as it's tested in the dedicated test-setup-minimal job.
+    #
+    # Test sharding: Split tests into groups for parallel execution.
+    # TORCHRL_TEST_SHARD can be: "all" (default), "1", "2", or "3"
+    #   - Shard 1: test_transforms.py (heaviest file, 571 parametrize decorators)
+    #   - Shard 2: test_envs.py, test_collectors.py (multiprocessing-heavy)
+    #   - Shard 3: Everything else (can use pytest-xdist for parallelism)
+    local shard="${TORCHRL_TEST_SHARD:-all}"
+    local common_ignores="--ignore test/test_rlhf.py --ignore test/test_distributed.py --ignore test/test_rb_distributed.py --ignore test/llm --ignore test/test_setup.py"
+    local common_args="--instafail --durations 200 -vv --capture no --timeout=120 --mp_fork_if_no_cuda"
+
+    # pytest-xdist parallelism: use -n auto for shard 3 (fewer multiprocessing tests).
+    # Set TORCHRL_XDIST=0 to disable parallel execution.
+    local xdist_args=""
+    if [ "${TORCHRL_XDIST:-1}" = "1" ] && [ "${shard}" = "3" ]; then
+        xdist_args="-n auto --dist loadgroup"
+        echo "Using pytest-xdist for parallel execution"
+    fi
+
+    case "${shard}" in
+        1)
+            echo "Running shard 1: test_transforms.py only"
+            python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_transforms.py \
+                ${common_args}
+            ;;
+        2)
+            echo "Running shard 2: test_envs.py and test_collectors.py"
+            python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_envs.py test/test_collectors.py \
+                ${common_args}
+            ;;
+        3)
+            echo "Running shard 3: All other tests"
+            python .github/unittest/helpers/coverage_run_parallel.py -m pytest test \
+                ${common_ignores} \
+                --ignore test/test_transforms.py \
+                --ignore test/test_envs.py \
+                --ignore test/test_collectors.py \
+                ${xdist_args} \
+                ${common_args}
+            ;;
+        all|"")
+            echo "Running all tests (no sharding)"
+            python .github/unittest/helpers/coverage_run_parallel.py -m pytest test \
+                ${common_ignores} \
+                ${common_args}
+            ;;
+        *)
+            echo "Unknown TORCHRL_TEST_SHARD='${shard}'. Expected: all|1|2|3."
+            exit 2
+            ;;
+    esac
 }

 case "${TORCHRL_TEST_SUITE}" in

.github/workflows/test-linux.yml (12 additions, 0 deletions)

@@ -77,6 +77,11 @@ jobs:
       matrix:
         python_version: ["3.12"]
         cuda_arch_version: ["13.0"]
+        # Test sharding: split tests into 3 parallel jobs for faster execution
+        #   Shard 1: test_transforms.py (heaviest)
+        #   Shard 2: test_envs.py + test_collectors.py (multiprocessing-heavy)
+        #   Shard 3: all other tests
+        shard: ["1", "2", "3"]
       fail-fast: false
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
@@ -103,12 +108,14 @@ jobs:

       # Run everything except distributed tests; those run in parallel in tests-gpu-distributed.
       export TORCHRL_TEST_SUITE=nondistributed
+      export TORCHRL_TEST_SHARD=${{ matrix.shard }}

       # Remove the following line when the GPU tests are working inside docker, and uncomment the above lines
       #export CU_VERSION="cpu"

       echo "PYTHON_VERSION: $PYTHON_VERSION"
       echo "CU_VERSION: $CU_VERSION"
+      echo "TORCHRL_TEST_SHARD: $TORCHRL_TEST_SHARD"

       ## setup_env.sh
       bash .github/unittest/linux/scripts/run_all.sh
@@ -227,6 +234,8 @@ jobs:
       matrix:
         python_version: ["3.12"] # "3.9", "3.10", "3.11"
         cuda_arch_version: ["13.0"] # "11.6", "11.7"
+        # Test sharding: split tests into 3 parallel jobs for faster execution
+        shard: ["1", "2", "3"]
       fail-fast: false
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
@@ -260,6 +269,9 @@ jobs:

       # Run everything except distributed tests; those run in parallel in tests-stable-gpu-distributed.
       export TORCHRL_TEST_SUITE=nondistributed
+      export TORCHRL_TEST_SHARD=${{ matrix.shard }}
+
+      echo "TORCHRL_TEST_SHARD: $TORCHRL_TEST_SHARD"

       ## setup_env.sh
       bash .github/unittest/linux/scripts/run_all.sh
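Conceptually, each of these matrix entries fans the job out into three parallel runners that differ only in the shard they export before calling the shared script. A sketch of the effective per-job command follows, where ${SHARD} is a stand-in for the matrix value, not a variable the workflow defines:

# Effective command per matrix job (SHARD is a placeholder for 1, 2, or 3):
export TORCHRL_TEST_SUITE=nondistributed
export TORCHRL_TEST_SHARD="${SHARD}"
bash .github/unittest/linux/scripts/run_all.sh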

test/conftest.py (22 additions, 12 deletions)

@@ -18,26 +18,36 @@
 IS_OSX = sys.platform == "darwin"


-def pytest_sessionfinish(maxprint=50):
-    out_str = """
-Call times:
-===========
-"""
+def pytest_sessionfinish(session, exitstatus, maxprint=50):
+    """Print aggregated test times per function (across all parametrizations)."""
     keys = list(CALL_TIMES.keys())
-    if len(keys) > 1:
-        maxchar = max(*[len(key) for key in keys])
-    elif len(keys) == 1:
-        maxchar = len(keys[0])
-    else:
+    if not keys:
         return
+
+    # Calculate total time
+    total_time = sum(CALL_TIMES.values())
+
+    out_str = f"""
+================================================================================
+AGGREGATED TEST TIMES (by function, across all parametrizations)
+================================================================================
+Total test time: {total_time:.1f}s ({total_time/60:.1f} min)
+Top {min(maxprint, len(keys))} slowest test functions:
+--------------------------------------------------------------------------------
+"""
+    maxchar = max(len(key) for key in keys)
     for i, (key, item) in enumerate(
         sorted(CALL_TIMES.items(), key=lambda x: x[1], reverse=True)
     ):
-        spaces = " " + " " * (maxchar - len(key))
-        out_str += f"\t{key}{spaces}{item: 4.4f}s\n"
+        spaces = " " * (maxchar - len(key) + 2)
+        pct = (item / total_time) * 100 if total_time > 0 else 0
+        out_str += f"  {key}{spaces}{item:7.2f}s ({pct:5.1f}%)\n"
         if i == maxprint - 1:
             break

+    out_str += "================================================================================\n"
+    sys.stdout.write(out_str)
+

 @pytest.fixture(autouse=True)
 def measure_duration(request: pytest.FixtureRequest):
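Because pytest_sessionfinish is a standard pytest hook, the aggregated report now also prints at the end of any local run. An illustrative invocation is sketched below; the test selection, function name, and all numbers are made up, and only the report shape follows the f-string above:

# Any pytest session in this repo ends with the aggregated timing report:
python -m pytest test/test_collectors.py -q
# Expected shape of the output (values illustrative, name hypothetical):
#   AGGREGATED TEST TIMES (by function, across all parametrizations)
#   Total test time: 84.3s (1.4 min)
#   Top 50 slowest test functions:
#     test_example_function    12.31s ( 14.6%)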

test/test_collectors.py (6 additions, 6 deletions)

@@ -759,8 +759,8 @@ def env_fn(seed):
             create_env_kwargs={"seed": seed},
             policy=policy,
             frames_per_batch=20,
-            max_frames_per_traj=2000,
-            total_frames=20000,
+            max_frames_per_traj=200,
+            total_frames=200,
             device="cpu",
         )
         torchrl_logger.info("Loop")
@@ -932,7 +932,7 @@ def _set_seed(self, seed: Optional[int]) -> None:
         result = subprocess.run(
             ["python", "-c", script], capture_output=True, text=True
         )
-        # This errors if the timeout is 5 secs, not 15
+        # This errors if the timeout is too short (3), succeeds if long enough (10)
         assert result.returncode == int(
             to == 3
         ), f"Test failed with output: {result.stdout}"
@@ -1136,7 +1136,7 @@ def make_and_test_policy(
         c = collector_type(
             envs,
             policy=policy,
-            total_frames=1000,
+            total_frames=100,
             frames_per_batch=10,
             policy_device=policy_device,
             env_device=env_device,
@@ -1779,7 +1779,7 @@ def _reset(self, tensordict: TensorDict | None = None, **kwargs) -> TensorDict:
             # Random sleep up to 10ms
             time.sleep(torch.rand(1).item() * 0.01)
         elif self.env_id % 2 == 1:
-            time.sleep(1)
+            time.sleep(0.1)

         self._step_count = 0
         return TensorDict(
@@ -1800,7 +1800,7 @@ def _step(self, tensordict: TensorDict) -> TensorDict:
         done = self._step_count >= self.max_steps

         if self.sleep_odd_only and self.env_id % 2 == 1:
-            time.sleep(1)
+            time.sleep(0.1)

         return TensorDict(
             {
