diff --git a/.github/workflows/_build_torch_with_cuda.yml b/.github/workflows/_build_torch_with_cuda.yml
index b15d6114dafd..227adbe19efe 100644
--- a/.github/workflows/_build_torch_with_cuda.yml
+++ b/.github/workflows/_build_torch_with_cuda.yml
@@ -22,6 +22,9 @@ jobs:
       image: ${{ inputs.dev-image }}
       env:
         _GLIBCXX_USE_CXX11_ABI: 0
+        TORCH_CUDA_ARCH_LIST: "5.2;7.0;7.5;8.0;9.0"
+        USE_CUDA: 1
+        MAX_JOBS: 24
     steps:
       - name: Checkout actions
         uses: actions/checkout@v4
@@ -34,18 +37,11 @@ jobs:
         with:
           torch-commit: ${{ inputs.torch-commit }}
           cuda: true
-      - name: Checkout PyTorch Repo
-        uses: actions/checkout@v4
-        with:
-          repository: pytorch/pytorch
-          path: pytorch
-          ref: ${{ inputs.torch-commit }}
-          submodules: recursive
      - name: Build PyTorch with CUDA enabled
        shell: bash
        run: |
          cd pytorch
-          TORCH_CUDA_ARCH_LIST="5.2;8.6" USE_CUDA=1 MAX_JOBS="$(nproc --ignore=4)" python setup.py bdist_wheel
+          python setup.py bdist_wheel
      - name: Upload wheel
        uses: actions/upload-artifact@v4
        with:
diff --git a/.github/workflows/_test_requiring_torch_cuda.yml b/.github/workflows/_test_requiring_torch_cuda.yml
index a2440230f6d5..1cb844e464af 100644
--- a/.github/workflows/_test_requiring_torch_cuda.yml
+++ b/.github/workflows/_test_requiring_torch_cuda.yml
@@ -94,8 +94,12 @@ jobs:
           pip install -U --pre jaxlib -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
           pip install -U --pre jax-cuda12-pjrt jax-cuda12-plugin -f https://storage.googleapis.com/jax-releases/jax_cuda_plugin_nightly_releases.html
           pip install -U --pre jax -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
-          pip install --no-deps triton==2.3.0
         if: ${{ matrix.run_triton_tests }}
+      - name: Install Triton
+        shell: bash
+        run: |
+          cd pytorch
+          make triton
       - name: Python Tests
         shell: bash
         run: |
@@ -106,5 +110,5 @@ jobs:
       - name: Triton Tests
         shell: bash
         run: |
-          PJRT_DEVICE=CUDA TRITON_PTXAS_PATH=/usr/local/cuda-12.1/bin/ptxas python pytorch/xla/test/test_triton.py
+          PJRT_DEVICE=CUDA TRITON_PTXAS_PATH=/usr/local/cuda-12.3/bin/ptxas python pytorch/xla/test/test_triton.py
         if: ${{ matrix.run_triton_tests }}
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 683af2abaa1e..393a388af0c3 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -42,25 +42,25 @@ jobs:
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
-  # Disable due to https://github.com/pytorch/xla/issues/8199
-  # build-torch-with-cuda:
-  #   name: "Build PyTorch with CUDA"
-  #   uses: ./.github/workflows/_build_torch_with_cuda.yml
-  #   needs: get-torch-commit
-  #   with:
-  #     # note that to build a torch wheel with CUDA enabled, we do not need a GPU runner.
-  #     dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
-  #     torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
-  #     runner: linux.24xlarge
+  build-torch-with-cuda:
+    name: "Build PyTorch with CUDA"
+    uses: ./.github/workflows/_build_torch_with_cuda.yml
+    needs: get-torch-commit
+    with:
+      # TODO: bump CUDA version to either 12.4 or 12.6 (supported by PyTorch).
+      # Ref: https://github.com/pytorch/xla/issues/8700
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.3
+      torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
+      # note that to build a torch wheel with CUDA enabled, we do not need a GPU runner.
+      runner: linux.24xlarge
 
-  # Disable due to https://github.com/pytorch/xla/issues/8199
-  # build-cuda-plugin:
-  #   name: "Build XLA CUDA plugin"
-  #   uses: ./.github/workflows/_build_plugin.yml
-  #   with:
-  #     dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
-  #   secrets:
-  #     gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
+  build-cuda-plugin:
+    name: "Build XLA CUDA plugin"
+    uses: ./.github/workflows/_build_plugin.yml
+    with:
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.3
+    secrets:
+      gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
   test-python-cpu:
     name: "CPU tests"
@@ -74,32 +74,30 @@ jobs:
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
-  # Disable due to https://github.com/pytorch/xla/issues/8199
-  # test-cuda:
-  #   name: "GPU tests"
-  #   uses: ./.github/workflows/_test.yml
-  #   needs: [build-torch-xla, build-cuda-plugin, get-torch-commit]
-  #   with:
-  #     dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
-  #     runner: linux.8xlarge.nvidia.gpu
-  #     timeout-minutes: 300
-  #     collect-coverage: false
-  #     install-cuda-plugin: true
-  #     torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
-  #   secrets:
-  #     gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
+  test-cuda:
+    name: "GPU tests"
+    uses: ./.github/workflows/_test.yml
+    needs: [build-torch-xla, build-cuda-plugin, get-torch-commit]
+    with:
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.3
+      runner: linux.g4dn.12xlarge.nvidia.gpu
+      timeout-minutes: 300
+      collect-coverage: false
+      install-cuda-plugin: true
+      torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
+    secrets:
+      gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
-  # Disable due to https://github.com/pytorch/xla/issues/8199
-  # test-cuda-with-pytorch-cuda-enabled:
-  #   name: "GPU tests requiring torch CUDA"
-  #   uses: ./.github/workflows/_test_requiring_torch_cuda.yml
-  #   needs: [build-torch-with-cuda, build-torch-xla, build-cuda-plugin, get-torch-commit]
-  #   with:
-  #     dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
-  #     runner: linux.8xlarge.nvidia.gpu
-  #     timeout-minutes: 300
-  #     collect-coverage: false
-  #     torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
+  test-cuda-with-pytorch-cuda-enabled:
+    name: "GPU tests requiring torch CUDA"
+    uses: ./.github/workflows/_test_requiring_torch_cuda.yml
+    needs: [build-torch-with-cuda, build-torch-xla, build-cuda-plugin, get-torch-commit]
+    with:
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.3
+      runner: linux.8xlarge.nvidia.gpu
+      timeout-minutes: 300
+      collect-coverage: false
+      torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
 
   test-tpu:
     name: "TPU tests"
diff --git a/.github/workflows/setup/action.yml b/.github/workflows/setup/action.yml
index 6953f99cac36..574b85e5b0d5 100644
--- a/.github/workflows/setup/action.yml
+++ b/.github/workflows/setup/action.yml
@@ -29,8 +29,8 @@ runs:
     - name: Setup CUDA environment
       shell: bash
       run: |
-        echo "PATH=$PATH:/usr/local/cuda-12.1/bin" >> $GITHUB_ENV
-        echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.1/lib64" >> $GITHUB_ENV
+        echo "PATH=$PATH:/usr/local/cuda-12.3/bin" >> $GITHUB_ENV
+        echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.3/lib64" >> $GITHUB_ENV
       if: ${{ inputs.cuda }}
     - name: Setup gcloud
      shell: bash
diff --git a/infra/ansible/config/vars.yaml b/infra/ansible/config/vars.yaml
index 120a6ae7ae93..845264ceb691 100644
--- a/infra/ansible/config/vars.yaml
+++ b/infra/ansible/config/vars.yaml
@@ -2,7 +2,7 @@
 cuda_repo: debian11
 cuda_version: "11.8"
 # Determines supported GPUs. See https://developer.nvidia.com/cuda-gpus
-cuda_compute_capabilities: 7.0,7.5,8.0,9.0
+cuda_compute_capabilities: 5.2,7.0,7.5,8.0,9.0
 # Used for fetching clang from the right repo, see apt.yaml.
 llvm_debian_repo: bullseye
 clang_version: 17
diff --git a/test/test_triton.py b/test/test_triton.py
index 3854b790cdbe..aa87b9884a7e 100644
--- a/test/test_triton.py
+++ b/test/test_triton.py
@@ -6,6 +6,7 @@
 import torch_xla.experimental.triton as xla_triton
 import torch_xla
 from torch_xla import runtime as xr
+from torch_xla.test.test_utils import skipIfCUDA
 
 import triton
 import triton.language as tl
@@ -241,6 +242,8 @@ def _attn_fwd(
     tl.store(O_block_ptr, acc.to(Out.type.element_ty))
 
 
+# Ref: https://github.com/pytorch/xla/pull/8593
+@skipIfCUDA("GPU CI is failing")
 class TritonTest(unittest.TestCase):
 
   @unittest.skipIf(xr.device_type() != 'CUDA', "This test only works on GPU.")
diff --git a/test/torch_distributed/test_ddp.py b/test/torch_distributed/test_ddp.py
index 61a8ef8a5935..d4e3fc77c7f2 100644
--- a/test/torch_distributed/test_ddp.py
+++ b/test/torch_distributed/test_ddp.py
@@ -3,6 +3,7 @@
 import sys
 import torch_xla
 import torch_xla.core.xla_model as xm
+from torch_xla.test.test_utils import skipIfCUDA
 
 # Setup import folders.
 xla_test_folder = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0])))
@@ -38,6 +39,8 @@ def _ddp_correctness(rank,
   def test_ddp_correctness(self):
     torch_xla.launch(self._ddp_correctness, args=(False, FLAGS.debug))
 
+  # Ref: https://github.com/pytorch/xla/pull/8593
+  @skipIfCUDA("GPU CI is failing")
   def test_ddp_correctness_with_gradient_as_bucket_view(self):
     torch_xla.launch(self._ddp_correctness, args=(False, FLAGS.debug, True))
 
diff --git a/torch_xla/test/test_utils.py b/torch_xla/test/test_utils.py
index 92b5dcb111f2..6e9f779c5f0a 100644
--- a/torch_xla/test/test_utils.py
+++ b/torch_xla/test/test_utils.py
@@ -11,6 +11,11 @@
 import torch_xla.utils.utils as xu
 
 
+def skipIfCUDA(reason):
+  accelerator = xr.device_type() or ""
+  return lambda f: unittest.skipIf(accelerator.lower() == "cuda", reason)(f)
+
+
 def mp_test(func):
   """Wraps a `unittest.TestCase` function running it within an isolated process.