diff --git a/.github/workflows/_build_torch_with_cuda.yml b/.github/workflows/_build_torch_with_cuda.yml
index b15d6114dafd..227adbe19efe 100644
--- a/.github/workflows/_build_torch_with_cuda.yml
+++ b/.github/workflows/_build_torch_with_cuda.yml
@@ -22,6 +22,9 @@ jobs:
       image: ${{ inputs.dev-image }}
       env:
         _GLIBCXX_USE_CXX11_ABI: 0
+        TORCH_CUDA_ARCH_LIST: "5.2;7.0;7.5;8.0;9.0"
+        USE_CUDA: 1
+        MAX_JOBS: 24
     steps:
       - name: Checkout actions
         uses: actions/checkout@v4
@@ -34,18 +37,11 @@ jobs:
         with:
           torch-commit: ${{ inputs.torch-commit }}
           cuda: true
-      - name: Checkout PyTorch Repo
-        uses: actions/checkout@v4
-        with:
-          repository: pytorch/pytorch
-          path: pytorch
-          ref: ${{ inputs.torch-commit }}
-          submodules: recursive
      - name: Build PyTorch with CUDA enabled
        shell: bash
        run: |
          cd pytorch
-          TORCH_CUDA_ARCH_LIST="5.2;8.6" USE_CUDA=1 MAX_JOBS="$(nproc --ignore=4)" python setup.py bdist_wheel
+          python setup.py bdist_wheel
      - name: Upload wheel
        uses: actions/upload-artifact@v4
        with:
diff --git a/.github/workflows/_test_requiring_torch_cuda.yml b/.github/workflows/_test_requiring_torch_cuda.yml
index a2440230f6d5..1cb844e464af 100644
--- a/.github/workflows/_test_requiring_torch_cuda.yml
+++ b/.github/workflows/_test_requiring_torch_cuda.yml
@@ -94,8 +94,12 @@ jobs:
           pip install -U --pre jaxlib -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
           pip install -U --pre jax-cuda12-pjrt jax-cuda12-plugin -f https://storage.googleapis.com/jax-releases/jax_cuda_plugin_nightly_releases.html
           pip install -U --pre jax -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
-          pip install --no-deps triton==2.3.0
         if: ${{ matrix.run_triton_tests }}
+      - name: Install Triton
+        shell: bash
+        run: |
+          cd pytorch
+          make triton
       - name: Python Tests
         shell: bash
         run: |
@@ -106,5 +110,5 @@ jobs:
       - name: Triton Tests
         shell: bash
         run: |
-          PJRT_DEVICE=CUDA TRITON_PTXAS_PATH=/usr/local/cuda-12.1/bin/ptxas python pytorch/xla/test/test_triton.py
+          PJRT_DEVICE=CUDA TRITON_PTXAS_PATH=/usr/local/cuda-12.3/bin/ptxas python pytorch/xla/test/test_triton.py
         if: ${{ matrix.run_triton_tests }}
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 683af2abaa1e..393a388af0c3 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -42,25 +42,25 @@ jobs:
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
-  # Disable due to https://github.com/pytorch/xla/issues/8199
-  # build-torch-with-cuda:
-  #   name: "Build PyTorch with CUDA"
-  #   uses: ./.github/workflows/_build_torch_with_cuda.yml
-  #   needs: get-torch-commit
-  #   with:
-  #     # note that to build a torch wheel with CUDA enabled, we do not need a GPU runner.
-  #     dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
-  #     torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
-  #     runner: linux.24xlarge
+  build-torch-with-cuda:
+    name: "Build PyTorch with CUDA"
+    uses: ./.github/workflows/_build_torch_with_cuda.yml
+    needs: get-torch-commit
+    with:
+      # TODO: bump CUDA version to either 12.4 or 12.6 (supported by PyTorch).
+      # Ref: https://github.com/pytorch/xla/issues/8700
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.3
+      torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
+      # note that to build a torch wheel with CUDA enabled, we do not need a GPU runner.
+      runner: linux.24xlarge
 
-  # Disable due to https://github.com/pytorch/xla/issues/8199
-  # build-cuda-plugin:
-  #   name: "Build XLA CUDA plugin"
-  #   uses: ./.github/workflows/_build_plugin.yml
-  #   with:
-  #     dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
-  #   secrets:
-  #     gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
+  build-cuda-plugin:
+    name: "Build XLA CUDA plugin"
+    uses: ./.github/workflows/_build_plugin.yml
+    with:
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.3
+    secrets:
+      gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
   test-python-cpu:
     name: "CPU tests"
@@ -74,32 +74,30 @@ jobs:
     secrets:
       gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
-  # Disable due to https://github.com/pytorch/xla/issues/8199
-  # test-cuda:
-  #   name: "GPU tests"
-  #   uses: ./.github/workflows/_test.yml
-  #   needs: [build-torch-xla, build-cuda-plugin, get-torch-commit]
-  #   with:
-  #     dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
-  #     runner: linux.8xlarge.nvidia.gpu
-  #     timeout-minutes: 300
-  #     collect-coverage: false
-  #     install-cuda-plugin: true
-  #     torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
-  #   secrets:
-  #     gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
+  test-cuda:
+    name: "GPU tests"
+    uses: ./.github/workflows/_test.yml
+    needs: [build-torch-xla, build-cuda-plugin, get-torch-commit]
+    with:
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.3
+      runner: linux.g4dn.12xlarge.nvidia.gpu
+      timeout-minutes: 300
+      collect-coverage: false
+      install-cuda-plugin: true
+      torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
+    secrets:
+      gcloud-service-key: ${{ secrets.GCLOUD_SERVICE_KEY }}
 
-  # Disable due to https://github.com/pytorch/xla/issues/8199
-  # test-cuda-with-pytorch-cuda-enabled:
-  #   name: "GPU tests requiring torch CUDA"
-  #   uses: ./.github/workflows/_test_requiring_torch_cuda.yml
-  #   needs: [build-torch-with-cuda, build-torch-xla, build-cuda-plugin, get-torch-commit]
-  #   with:
-  #     dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.1
-  #     runner: linux.8xlarge.nvidia.gpu
-  #     timeout-minutes: 300
-  #     collect-coverage: false
-  #     torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
+  test-cuda-with-pytorch-cuda-enabled:
+    name: "GPU tests requiring torch CUDA"
+    uses: ./.github/workflows/_test_requiring_torch_cuda.yml
+    needs: [build-torch-with-cuda, build-torch-xla, build-cuda-plugin, get-torch-commit]
+    with:
+      dev-image: us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/development:3.10_cuda_12.3
+      runner: linux.8xlarge.nvidia.gpu
+      timeout-minutes: 300
+      collect-coverage: false
+      torch-commit: ${{needs.get-torch-commit.outputs.torch_commit}}
 
   test-tpu:
     name: "TPU tests"
diff --git a/.github/workflows/setup/action.yml b/.github/workflows/setup/action.yml
index 6953f99cac36..574b85e5b0d5 100644
--- a/.github/workflows/setup/action.yml
+++ b/.github/workflows/setup/action.yml
@@ -29,8 +29,8 @@ runs:
     - name: Setup CUDA environment
       shell: bash
       run: |
-        echo "PATH=$PATH:/usr/local/cuda-12.1/bin" >> $GITHUB_ENV
-        echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.1/lib64" >> $GITHUB_ENV
+        echo "PATH=$PATH:/usr/local/cuda-12.3/bin" >> $GITHUB_ENV
+        echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.3/lib64" >> $GITHUB_ENV
       if: ${{ inputs.cuda }}
     - name: Setup gcloud
      shell: bash
diff --git a/infra/ansible/config/vars.yaml b/infra/ansible/config/vars.yaml
index 120a6ae7ae93..845264ceb691 100644
--- a/infra/ansible/config/vars.yaml
+++ b/infra/ansible/config/vars.yaml
@@ -2,7 +2,7 @@
 cuda_repo: debian11
 cuda_version: "11.8"
 # Determines supported GPUs. See https://developer.nvidia.com/cuda-gpus
-cuda_compute_capabilities: 7.0,7.5,8.0,9.0
+cuda_compute_capabilities: 5.2,7.0,7.5,8.0,9.0
 # Used for fetching clang from the right repo, see apt.yaml.
 llvm_debian_repo: bullseye
 clang_version: 17
diff --git a/test/test_triton.py b/test/test_triton.py
index 3854b790cdbe..aa87b9884a7e 100644
--- a/test/test_triton.py
+++ b/test/test_triton.py
@@ -6,6 +6,7 @@
 import torch_xla.experimental.triton as xla_triton
 import torch_xla
 from torch_xla import runtime as xr
+from torch_xla.test.test_utils import skipIfCUDA
 
 import triton
 import triton.language as tl
@@ -241,6 +242,8 @@ def _attn_fwd(
     tl.store(O_block_ptr, acc.to(Out.type.element_ty))
 
 
+# Ref: https://github.com/pytorch/xla/pull/8593
+@skipIfCUDA("GPU CI is failing")
 class TritonTest(unittest.TestCase):
 
   @unittest.skipIf(xr.device_type() != 'CUDA', "This test only works on GPU.")
diff --git a/test/torch_distributed/test_ddp.py b/test/torch_distributed/test_ddp.py
index 61a8ef8a5935..d4e3fc77c7f2 100644
--- a/test/torch_distributed/test_ddp.py
+++ b/test/torch_distributed/test_ddp.py
@@ -3,6 +3,7 @@
 import sys
 import torch_xla
 import torch_xla.core.xla_model as xm
+from torch_xla.test.test_utils import skipIfCUDA
 
 # Setup import folders.
 xla_test_folder = os.path.dirname(os.path.dirname(os.path.abspath(sys.argv[0])))
@@ -38,6 +39,8 @@ def _ddp_correctness(rank,
   def test_ddp_correctness(self):
     torch_xla.launch(self._ddp_correctness, args=(False, FLAGS.debug))
 
+  # Ref: https://github.com/pytorch/xla/pull/8593
+  @skipIfCUDA("GPU CI is failing")
   def test_ddp_correctness_with_gradient_as_bucket_view(self):
     torch_xla.launch(self._ddp_correctness, args=(False, FLAGS.debug, True))
 
diff --git a/torch_xla/test/test_utils.py b/torch_xla/test/test_utils.py
index 92b5dcb111f2..6e9f779c5f0a 100644
--- a/torch_xla/test/test_utils.py
+++ b/torch_xla/test/test_utils.py
@@ -11,6 +11,11 @@
 import torch_xla.utils.utils as xu
 
 
+def skipIfCUDA(reason):
+  accelerator = xr.device_type() or ""
+  return lambda f: unittest.skipIf(accelerator.lower() == "cuda", reason)(f)
+
+
 def mp_test(func):
   """Wraps a `unittest.TestCase` function running it within an isolated process.