Release v1.10

ptrendx committed Sep 11, 2024
2 parents e79d915 + a7e9d3e, commit 08a85d3

Showing 129 changed files with 7,742 additions and 2,560 deletions.
7 changes: 5 additions & 2 deletions .github/workflows/blossom-ci.yml

@@ -23,9 +23,12 @@ jobs:
       args: ${{ env.args }}

     # This job only runs for pull request comments
-    if: |
-      contains( ',ptrendx,ksivaman,', format(',{0},', github.actor)) &&
+    if: >
       github.event.comment.body == '/blossom-ci'
+      && (
+        github.actor == 'ptrendx'
+        || github.actor == 'ksivaman'
+      )
     steps:
       - name: Check if comment is issued by authorized person
         run: blossom-ci
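The rewritten condition trades the comma-padded `contains()` idiom for one explicit equality per actor. A small Python sketch (illustration only, not part of the diff) of what both forms compute, and why the old one padded with commas:

```python
# Hypothetical re-implementation of the two authorization checks.
ALLOWED = ",ptrendx,ksivaman,"  # comma-padded list from the old expression

def authorized_old(actor: str) -> bool:
    # Mirrors contains(',ptrendx,ksivaman,', format(',{0},', github.actor));
    # the commas prevent substring hits such as "trend" matching "ptrendx".
    return f",{actor}," in ALLOWED

def authorized_new(actor: str) -> bool:
    # Mirrors the explicit equality chain in the new expression.
    return actor in ("ptrendx", "ksivaman")

assert authorized_old("ptrendx") and authorized_new("ptrendx")
assert not authorized_old("trend") and not authorized_new("trend")
```

The explicit form is longer, but each authorized user is now a separate, reviewable line.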
14 changes: 10 additions & 4 deletions .github/workflows/build.yml

@@ -12,7 +12,7 @@ jobs:
     name: 'Core'
     runs-on: ubuntu-latest
     container:
-      image: nvcr.io/nvidia/cuda:12.5.0-devel-ubuntu22.04
+      image: nvcr.io/nvidia/cuda:12.0.0-devel-ubuntu22.04
       options: --user root
     steps:
       - name: 'Dependencies'
@@ -35,9 +35,14 @@ jobs:
     name: 'PyTorch'
     runs-on: ubuntu-latest
     container:
-      image: nvcr.io/nvidia/pytorch:24.05-py3
+      image: nvcr.io/nvidia/cuda:12.5.0-devel-ubuntu22.04
       options: --user root
     steps:
+      - name: 'Dependencies'
+        run: |
+          apt-get update
+          apt-get install -y git python3.9 pip ninja-build cudnn9-cuda-12
+          pip install cmake torch pydantic importlib-metadata>=1.0 packaging pybind11
       - name: 'Checkout'
         uses: actions/checkout@v3
         with:
@@ -48,7 +53,8 @@ jobs:
           NVTE_FRAMEWORK: pytorch
           MAX_JOBS: 1
       - name: 'Sanity check'
-        run: python tests/pytorch/test_sanity_import.py
+        if: false  # Sanity import test requires Flash Attention
+        run: python3 tests/pytorch/test_sanity_import.py
   jax:
     name: 'JAX'
     runs-on: ubuntu-latest
@@ -70,7 +76,7 @@ jobs:
     name: 'PaddlePaddle'
     runs-on: ubuntu-latest
     container:
-      image: nvcr.io/nvidia/paddlepaddle:24.05-py3
+      image: nvcr.io/nvidia/paddlepaddle:24.07-py3
       options: --user root
     steps:
       - name: 'Checkout'
20 changes: 18 additions & 2 deletions .github/workflows/trigger-ci.yml

@@ -15,9 +15,25 @@ jobs:
       args: ${{ env.args }}

     # This job only runs for pull request comments
-    if: |
-      contains( ',ptrendx,ksivaman,schetlur-nv,timmoon10,zlsh80826,mingxu1067,cyanguwa,nzmora-nvidia,galagam,nouiz,denera,sudhakarsingh27,Oleg-Goncharov,phu0ngng,nvcforster,', format(',{0},', github.actor)) &&
+    if: >
       startsWith(github.event.comment.body, '/te-ci')
+      && (
+        github.actor == 'ptrendx'
+        || github.actor == 'ksivaman'
+        || github.actor == 'schetlur-nv'
+        || github.actor == 'timmoon10'
+        || github.actor == 'zlsh80826'
+        || github.actor == 'mingxu1067'
+        || github.actor == 'cyanguwa'
+        || github.actor == 'nzmora-nvidia'
+        || github.actor == 'galagam'
+        || github.actor == 'nouiz'
+        || github.actor == 'denera'
+        || github.actor == 'sudhakarsingh27'
+        || github.actor == 'Oleg-Goncharov'
+        || github.actor == 'phu0ngng'
+        || github.actor == 'xrennvidia'
+      )
     steps:
       - name: Check if comment is issued by authorized person
         run: blossom-ci
2 changes: 1 addition & 1 deletion 3rdparty/cudnn-frontend
Submodule cudnn-frontend updated 118 files
6 changes: 3 additions & 3 deletions README.rst

@@ -149,8 +149,8 @@ Installation
 Pre-requisites
 ^^^^^^^^^^^^^^^^^^^^
 * Linux x86_64
-* CUDA 11.8+ for Hopper and CUDA 12.1+ for Ada
-* NVIDIA Driver supporting CUDA 11.8 or later
+* CUDA 12.0+ for Hopper and CUDA 12.1+ for Ada
+* NVIDIA Driver supporting CUDA 12.0 or later
 * cuDNN 8.1 or later
 * For fused attention, CUDA 12.1 or later, NVIDIA Driver supporting CUDA 12.1 or later, and cuDNN 8.9 or later.

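A quick way to verify the updated CUDA and cuDNN floors is to read the versions out of an existing PyTorch install. A minimal sketch, assuming a CUDA-enabled PyTorch; the thresholds come from the prerequisites hunk above:

```python
# Hedged sanity check for the new prerequisites (CUDA 12.0+, cuDNN 8.1+).
# Assumes a CUDA build of PyTorch; torch.version.cuda is None on CPU-only builds.
import torch

cuda = tuple(int(x) for x in torch.version.cuda.split("."))  # e.g. (12, 1)
cudnn = torch.backends.cudnn.version()  # integer encoding, e.g. 8902 for 8.9.2
assert cuda >= (12, 0), "Transformer Engine now requires CUDA 12.0 or newer"
assert cudnn >= 8100, "Transformer Engine requires cuDNN 8.1 or newer"
```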
@@ -182,7 +182,7 @@ From source

 Compiling with FlashAttention-2
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Transformer Engine release v0.11.0 adds support for FlashAttention-2 in PyTorch for improved performance. 
+Transformer Engine release v0.11.0 adds support for FlashAttention-2 in PyTorch for improved performance.

 It is a known issue that FlashAttention-2 compilation is resource-intensive and requires a large amount of RAM (see `bug <https://github.com/Dao-AILab/flash-attention/issues/358>`_), which may lead to out of memory errors during the installation of Transformer Engine. Please try setting **MAX_JOBS=1** in the environment to circumvent the issue.
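As the README hunk above notes, **MAX_JOBS=1** caps compile parallelism when FlashAttention-2 would otherwise exhaust RAM. A minimal sketch of setting it programmatically before a from-source install; the `pip install .` target assumes you are inside a Transformer Engine checkout:

```python
# Equivalent of `MAX_JOBS=1 pip install .` from the shell.
import os
import subprocess

env = dict(os.environ, MAX_JOBS="1")  # cap concurrent compiler jobs
subprocess.run(["pip", "install", "."], env=env, check=True)
```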
20 changes: 7 additions & 13 deletions benchmarks/attention/benchmark_attention.py

@@ -11,9 +11,7 @@
 import transformer_engine
 from tests.pytorch.fused_attn.test_fused_attn import (
     ModelConfig,
-    _is_flash_attention_supported,
-    _is_fused_attention_supported,
-    _is_unfused_attention_supported,
+    _get_attention_backends,
     _run_dot_product_attention,
 )

@@ -29,8 +27,6 @@
 workspace_opt = True
 # QKV memory layout
 qkv_layout = "bshd_bshd_bshd"
-# sliding window attention
-swa = False
 # padding between sequences for qkv_format=thd
 pad_between_seqs = False
 # training mode
@@ -64,7 +60,6 @@ def benchmark_dot_product_attention(model, fused_attn_supported, flash_attn_supp
         ckpt_attn,
         qkv_layout,
         workspace_opt,
-        swa,
         pad_between_seqs,
         is_training,
     )
@@ -76,7 +71,6 @@ def benchmark_dot_product_attention(model, fused_attn_supported, flash_attn_supp
         ckpt_attn,
         qkv_layout,
         workspace_opt,
-        swa,
         pad_between_seqs,
         is_training,
     )
@@ -97,7 +91,6 @@ def benchmark_dot_product_attention(model, fused_attn_supported, flash_attn_supp
         ckpt_attn,
         qkv_layout,
         workspace_opt,
-        swa,
         pad_between_seqs,
         is_training,
     )
@@ -115,7 +108,6 @@ def benchmark_dot_product_attention(model, fused_attn_supported, flash_attn_supp
         ckpt_attn,
         qkv_layout,
         workspace_opt,
-        swa,
         pad_between_seqs,
         is_training,
     )
@@ -205,13 +197,15 @@ def main():
     )
     for model in model_configs.keys():
         config = model_configs[model]
-        fused_attn_supported, fused_attn_backend = _is_fused_attention_supported(
+        available_backends, fused_attn_backends = _get_attention_backends(
             config,
-            dtype,
+            qkv_dtype=dtype,
             qkv_layout=qkv_layout,
+            window_size=config.window_size,
+            pad_between_seqs=pad_between_seqs,
         )
-        fused_attn_supported = fused_attn_supported and not swa
-        flash_attn_supported = _is_flash_attention_supported(config)
+        flash_attn_supported, fused_attn_supported, unfused_attn_supported = available_backends

         print(
             f'Running {model} with {"cuDNN attention" if fused_attn_supported else ""}'
             f'{" and flash-attention" if flash_attn_supported else ""}...'
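The benchmark now asks one helper for all three backends instead of probing each separately. A hedged sketch of the new selection pattern, with the call mirroring the hunk above (`config` is a `ModelConfig` from the test utilities; the wrapper function name is illustrative):

```python
from tests.pytorch.fused_attn.test_fused_attn import _get_attention_backends

def select_backends(config, dtype, qkv_layout, pad_between_seqs):
    """Return (flash, fused, unfused) support flags, as in the updated main()."""
    available_backends, fused_attn_backends = _get_attention_backends(
        config,
        qkv_dtype=dtype,
        qkv_layout=qkv_layout,
        window_size=config.window_size,
        pad_between_seqs=pad_between_seqs,
    )
    flash_ok, fused_ok, unfused_ok = available_backends
    return flash_ok, fused_ok, unfused_ok
```

Note that sliding-window support is now folded into the query via `window_size`, which is why the standalone `swa` flag could be dropped.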
2 changes: 1 addition & 1 deletion build_tools/VERSION.txt

@@ -1 +1 @@
-1.9.0
+1.10.0
9 changes: 7 additions & 2 deletions build_tools/build_ext.py

@@ -10,6 +10,7 @@
 import sys
 import sysconfig
 import copy
+import time

 from pathlib import Path
 from subprocess import CalledProcessError
@@ -69,8 +70,8 @@ def _build_cmake(self, build_dir: Path, install_dir: Path) -> None:
     configure_command.append(f"-Dpybind11_DIR={pybind11_dir}")

     # CMake build and install commands
-    build_command = [_cmake_bin, "--build", build_dir]
-    install_command = [_cmake_bin, "--install", build_dir]
+    build_command = [_cmake_bin, "--build", build_dir, "--verbose"]
+    install_command = [_cmake_bin, "--install", build_dir, "--verbose"]

     # Check whether parallel build is restricted
     max_jobs = get_max_jobs_for_parallel_build()
@@ -81,13 +82,17 @@ def _build_cmake(self, build_dir: Path, install_dir: Path) -> None:
         build_command.append(str(max_jobs))

     # Run CMake commands
+    start_time = time.perf_counter()
     for command in [configure_command, build_command, install_command]:
         print(f"Running command {' '.join(command)}")
         try:
             subprocess.run(command, cwd=build_dir, check=True)
         except (CalledProcessError, OSError) as e:
             raise RuntimeError(f"Error when running CMake: {e}")
+
+    total_time = time.perf_counter() - start_time
+    print(f"Time for build_ext: {total_time:.2f} seconds")


 def get_build_ext(extension_cls: Type[setuptools.Extension]):
     class _CMakeBuildExtension(extension_cls):
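The new code wraps the whole configure/build/install sequence in a single `time.perf_counter()` measurement. A standalone sketch of the pattern, with a stand-in command so it runs anywhere CMake is installed:

```python
# Timing pattern added to _build_cmake, reduced to a runnable stand-in.
import subprocess
import time

start_time = time.perf_counter()
for command in [["cmake", "--version"]]:  # real code: configure, build, install
    print(f"Running command {' '.join(command)}")
    subprocess.run(command, check=True)
print(f"Time for build_ext: {time.perf_counter() - start_time:.2f} seconds")
```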
24 changes: 23 additions & 1 deletion build_tools/jax.py

@@ -2,7 +2,8 @@
 #
 # See LICENSE for license information.

-"""Paddle-paddle related extensions."""
+"""JAX related extensions."""
+import os
 from pathlib import Path

 import setuptools
@@ -12,6 +13,25 @@
 from typing import List


+def xla_path() -> str:
+    """XLA root path lookup.
+    Throws FileNotFoundError if XLA source is not found."""
+
+    try:
+        from jax.extend import ffi
+    except ImportError:
+        if os.getenv("XLA_HOME"):
+            xla_home = Path(os.getenv("XLA_HOME"))
+        else:
+            xla_home = "/opt/xla"
+    else:
+        xla_home = ffi.include_dir()
+
+    if not os.path.isdir(xla_home):
+        raise FileNotFoundError("Could not find xla source.")
+    return xla_home
+
+
 def setup_jax_extension(
     csrc_source_files,
     csrc_header_files,
@@ -27,12 +47,14 @@ def setup_jax_extension(

     # Header files
     cuda_home, _ = cuda_path()
+    xla_home = xla_path()
     include_dirs = [
         cuda_home / "include",
         common_header_files,
         common_header_files / "common",
         common_header_files / "common" / "include",
         csrc_header_files,
+        xla_home,
     ]

     # Compile flags
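Per the new helper, the XLA include root resolves in this order: `jax.extend.ffi.include_dir()` if a sufficiently new JAX is installed, else `$XLA_HOME`, else `/opt/xla`. A hedged usage sketch; the `build_tools.jax` import path is assumed from the file layout:

```python
import os
from build_tools.jax import xla_path  # import path assumed from the diff above

os.environ.setdefault("XLA_HOME", "/opt/xla")  # consulted only if jax.extend.ffi is absent
try:
    print("XLA headers found at:", xla_path())
except FileNotFoundError:
    print("No XLA source; install a recent jax or point XLA_HOME at a checkout")
```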
19 changes: 13 additions & 6 deletions build_tools/paddle.py

@@ -6,6 +6,7 @@
 from pathlib import Path

 import setuptools
+import os

 from .utils import cuda_version

@@ -61,12 +62,18 @@ def setup_paddle_extension(
     except FileNotFoundError:
         print("Could not determine CUDA Toolkit version")
     else:
-        if version >= (11, 2):
-            nvcc_flags.extend(["--threads", "4"])
-        if version >= (11, 0):
-            nvcc_flags.extend(["-gencode", "arch=compute_80,code=sm_80"])
-        if version >= (11, 8):
-            nvcc_flags.extend(["-gencode", "arch=compute_90,code=sm_90"])
+        if version < (12, 0):
+            raise RuntimeError("Transformer Engine requires CUDA 12.0 or newer")
+        nvcc_flags.extend(
+            (
+                "--threads",
+                os.getenv("NVTE_BUILD_THREADS_PER_JOB", "1"),
+                "-gencode",
+                "arch=compute_80,code=sm_80",
+                "-gencode",
+                "arch=compute_90,code=sm_90",
+            )
+        )

     # Construct Paddle CUDA extension
     sources = [str(path) for path in sources]
18 changes: 12 additions & 6 deletions build_tools/pytorch.py

@@ -67,12 +67,18 @@ def setup_pytorch_extension(
     except FileNotFoundError:
         print("Could not determine CUDA Toolkit version")
     else:
-        if version >= (11, 2):
-            nvcc_flags.extend(["--threads", "4"])
-        if version >= (11, 0):
-            nvcc_flags.extend(["-gencode", "arch=compute_80,code=sm_80"])
-        if version >= (11, 8):
-            nvcc_flags.extend(["-gencode", "arch=compute_90,code=sm_90"])
+        if version < (12, 0):
+            raise RuntimeError("Transformer Engine requires CUDA 12.0 or newer")
+        nvcc_flags.extend(
+            (
+                "--threads",
+                os.getenv("NVTE_BUILD_THREADS_PER_JOB", "1"),
+                "-gencode",
+                "arch=compute_80,code=sm_80",
+                "-gencode",
+                "arch=compute_90,code=sm_90",
+            )
+        )

     # Libraries
     library_dirs = []
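Both the PyTorch and Paddle builds now share the same flag logic: fail below CUDA 12.0, target sm_80 and sm_90 unconditionally, and read nvcc's `--threads` count from `NVTE_BUILD_THREADS_PER_JOB` (defaulting to 1 instead of the old hard-coded 4). A hedged re-implementation; the function name is illustrative, not from the repo:

```python
import os

def nvte_nvcc_flags(version: tuple) -> list:
    """Mirror of the gencode/threads logic from build_tools/{pytorch,paddle}.py."""
    if version < (12, 0):
        raise RuntimeError("Transformer Engine requires CUDA 12.0 or newer")
    return [
        "--threads", os.getenv("NVTE_BUILD_THREADS_PER_JOB", "1"),
        "-gencode", "arch=compute_80,code=sm_80",  # Ampere
        "-gencode", "arch=compute_90,code=sm_90",  # Hopper
    ]

print(nvte_nvcc_flags((12, 4)))  # ['--threads', '1', '-gencode', ...]
```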