Release v1.10

ptrendx committed Sep 11, 2024
2 parents e79d915 + a7e9d3e, commit 08a85d3

Showing 129 changed files with 7,742 additions and 2,560 deletions.
7 changes: 5 additions & 2 deletions .github/workflows/blossom-ci.yml

@@ -23,9 +23,12 @@ jobs:
       args: ${{ env.args }}

     # This job only runs for pull request comments
-    if: |
-      contains( ',ptrendx,ksivaman,', format(',{0},', github.actor)) &&
+    if: >
       github.event.comment.body == '/blossom-ci'
+      && (
+        github.actor == 'ptrendx'
+        || github.actor == 'ksivaman'
+      )
     steps:
       - name: Check if comment is issued by authorized person
         run: blossom-ci
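The rewritten condition trades the comma-padded `contains()` idiom for one explicit equality per actor. A small Python sketch (illustration only, not part of the diff) of what both forms compute, and why the old one padded with commas:

```python
# Hypothetical re-implementation of the two authorization checks.
ALLOWED = ",ptrendx,ksivaman,"  # comma-padded list from the old expression

def authorized_old(actor: str) -> bool:
    # Mirrors contains(',ptrendx,ksivaman,', format(',{0},', github.actor));
    # the commas prevent substring hits such as "trend" matching "ptrendx".
    return f",{actor}," in ALLOWED

def authorized_new(actor: str) -> bool:
    # Mirrors the explicit equality chain in the new expression.
    return actor in ("ptrendx", "ksivaman")

assert authorized_old("ptrendx") and authorized_new("ptrendx")
assert not authorized_old("trend") and not authorized_new("trend")
```

The explicit form is longer, but each authorized user is now a separate, reviewable line.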
14 changes: 10 additions & 4 deletions .github/workflows/build.yml

@@ -12,7 +12,7 @@ jobs:
     name: 'Core'
     runs-on: ubuntu-latest
     container:
-      image: nvcr.io/nvidia/cuda:12.5.0-devel-ubuntu22.04
+      image: nvcr.io/nvidia/cuda:12.0.0-devel-ubuntu22.04
       options: --user root
     steps:
       - name: 'Dependencies'
@@ -35,9 +35,14 @@ jobs:
     name: 'PyTorch'
     runs-on: ubuntu-latest
     container:
-      image: nvcr.io/nvidia/pytorch:24.05-py3
+      image: nvcr.io/nvidia/cuda:12.5.0-devel-ubuntu22.04
       options: --user root
     steps:
+      - name: 'Dependencies'
+        run: |
+          apt-get update
+          apt-get install -y git python3.9 pip ninja-build cudnn9-cuda-12
+          pip install cmake torch pydantic importlib-metadata>=1.0 packaging pybind11
       - name: 'Checkout'
         uses: actions/checkout@v3
         with:
@@ -48,7 +53,8 @@ jobs:
           NVTE_FRAMEWORK: pytorch
           MAX_JOBS: 1
       - name: 'Sanity check'
-        run: python tests/pytorch/test_sanity_import.py
+        if: false  # Sanity import test requires Flash Attention
+        run: python3 tests/pytorch/test_sanity_import.py
   jax:
     name: 'JAX'
     runs-on: ubuntu-latest
@@ -70,7 +76,7 @@ jobs:
     name: 'PaddlePaddle'
     runs-on: ubuntu-latest
     container:
-      image: nvcr.io/nvidia/paddlepaddle:24.05-py3
+      image: nvcr.io/nvidia/paddlepaddle:24.07-py3
       options: --user root
     steps:
       - name: 'Checkout'
20 changes: 18 additions & 2 deletions .github/workflows/trigger-ci.yml

@@ -15,9 +15,25 @@ jobs:
       args: ${{ env.args }}

     # This job only runs for pull request comments
-    if: |
-      contains( ',ptrendx,ksivaman,schetlur-nv,timmoon10,zlsh80826,mingxu1067,cyanguwa,nzmora-nvidia,galagam,nouiz,denera,sudhakarsingh27,Oleg-Goncharov,phu0ngng,nvcforster,', format(',{0},', github.actor)) &&
+    if: >
       startsWith(github.event.comment.body, '/te-ci')
+      && (
+        github.actor == 'ptrendx'
+        || github.actor == 'ksivaman'
+        || github.actor == 'schetlur-nv'
+        || github.actor == 'timmoon10'
+        || github.actor == 'zlsh80826'
+        || github.actor == 'mingxu1067'
+        || github.actor == 'cyanguwa'
+        || github.actor == 'nzmora-nvidia'
+        || github.actor == 'galagam'
+        || github.actor == 'nouiz'
+        || github.actor == 'denera'
+        || github.actor == 'sudhakarsingh27'
+        || github.actor == 'Oleg-Goncharov'
+        || github.actor == 'phu0ngng'
+        || github.actor == 'xrennvidia'
+      )
     steps:
       - name: Check if comment is issued by authorized person
         run: blossom-ci
2 changes: 1 addition & 1 deletion 3rdparty/cudnn-frontend
Submodule cudnn-frontend updated 118 files
6 changes: 3 additions & 3 deletions README.rst

@@ -149,8 +149,8 @@ Installation
 Pre-requisites
 ^^^^^^^^^^^^^^^^^^^^
 * Linux x86_64
-* CUDA 11.8+ for Hopper and CUDA 12.1+ for Ada
-* NVIDIA Driver supporting CUDA 11.8 or later
+* CUDA 12.0+ for Hopper and CUDA 12.1+ for Ada
+* NVIDIA Driver supporting CUDA 12.0 or later
 * cuDNN 8.1 or later
 * For fused attention, CUDA 12.1 or later, NVIDIA Driver supporting CUDA 12.1 or later, and cuDNN 8.9 or later.

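A quick way to verify the updated CUDA and cuDNN floors is to read the versions out of an existing PyTorch install. A minimal sketch, assuming a CUDA-enabled PyTorch; the thresholds come from the prerequisites hunk above:

```python
# Hedged sanity check for the new prerequisites (CUDA 12.0+, cuDNN 8.1+).
# Assumes a CUDA build of PyTorch; torch.version.cuda is None on CPU-only builds.
import torch

cuda = tuple(int(x) for x in torch.version.cuda.split("."))  # e.g. (12, 1)
cudnn = torch.backends.cudnn.version()  # integer encoding, e.g. 8902 for 8.9.2
assert cuda >= (12, 0), "Transformer Engine now requires CUDA 12.0 or newer"
assert cudnn >= 8100, "Transformer Engine requires cuDNN 8.1 or newer"
```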
@@ -182,7 +182,7 @@ From source

 Compiling with FlashAttention-2
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Transformer Engine release v0.11.0 adds support for FlashAttention-2 in PyTorch for improved performance. 
+Transformer Engine release v0.11.0 adds support for FlashAttention-2 in PyTorch for improved performance.

 It is a known issue that FlashAttention-2 compilation is resource-intensive and requires a large amount of RAM (see `bug <https://github.com/Dao-AILab/flash-attention/issues/358>`_), which may lead to out of memory errors during the installation of Transformer Engine. Please try setting **MAX_JOBS=1** in the environment to circumvent the issue.
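As the README hunk above notes, **MAX_JOBS=1** caps compile parallelism when FlashAttention-2 would otherwise exhaust RAM. A minimal sketch of setting it programmatically before a from-source install; the `pip install .` target assumes you are inside a Transformer Engine checkout:

```python
# Equivalent of `MAX_JOBS=1 pip install .` from the shell.
import os
import subprocess

env = dict(os.environ, MAX_JOBS="1")  # cap concurrent compiler jobs
subprocess.run(["pip", "install", "."], env=env, check=True)
```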
20 changes: 7 additions & 13 deletions benchmarks/attention/benchmark_attention.py

@@ -11,9 +11,7 @@
 import transformer_engine
 from tests.pytorch.fused_attn.test_fused_attn import (
     ModelConfig,
-    _is_flash_attention_supported,
-    _is_fused_attention_supported,
-    _is_unfused_attention_supported,
+    _get_attention_backends,
     _run_dot_product_attention,
 )

@@ -29,8 +27,6 @@
 workspace_opt = True
 # QKV memory layout
 qkv_layout = "bshd_bshd_bshd"
-# sliding window attention
-swa = False
 # padding between sequences for qkv_format=thd
 pad_between_seqs = False
 # training mode
@@ -64,7 +60,6 @@ def benchmark_dot_product_attention(model, fused_attn_supported, flash_attn_supp
         ckpt_attn,
         qkv_layout,
         workspace_opt,
-        swa,
         pad_between_seqs,
         is_training,
     )
@@ -76,7 +71,6 @@ def benchmark_dot_product_attention(model, fused_attn_supported, flash_attn_supp
         ckpt_attn,
         qkv_layout,
         workspace_opt,
-        swa,
         pad_between_seqs,
         is_training,
     )
@@ -97,7 +91,6 @@ def benchmark_dot_product_attention(model, fused_attn_supported, flash_attn_supp
         ckpt_attn,
         qkv_layout,
         workspace_opt,
-        swa,
         pad_between_seqs,
         is_training,
     )
@@ -115,7 +108,6 @@ def benchmark_dot_product_attention(model, fused_attn_supported, flash_attn_supp
         ckpt_attn,
         qkv_layout,
         workspace_opt,
-        swa,
         pad_between_seqs,
         is_training,
     )
@@ -205,13 +197,15 @@ def main():
     )
     for model in model_configs.keys():
         config = model_configs[model]
-        fused_attn_supported, fused_attn_backend = _is_fused_attention_supported(
+        available_backends, fused_attn_backends = _get_attention_backends(
             config,
-            dtype,
+            qkv_dtype=dtype,
             qkv_layout=qkv_layout,
+            window_size=config.window_size,
+            pad_between_seqs=pad_between_seqs,
         )
-        fused_attn_supported = fused_attn_supported and not swa
-        flash_attn_supported = _is_flash_attention_supported(config)
+        flash_attn_supported, fused_attn_supported, unfused_attn_supported = available_backends

         print(
             f'Running {model} with {"cuDNN attention" if fused_attn_supported else ""}'
             f'{" and flash-attention" if flash_attn_supported else ""}...'
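The benchmark now asks one helper for all three backends instead of probing each separately. A hedged sketch of the new selection pattern, with the call mirroring the hunk above (`config` is a `ModelConfig` from the test utilities; the wrapper function name is illustrative):

```python
from tests.pytorch.fused_attn.test_fused_attn import _get_attention_backends

def select_backends(config, dtype, qkv_layout, pad_between_seqs):
    """Return (flash, fused, unfused) support flags, as in the updated main()."""
    available_backends, fused_attn_backends = _get_attention_backends(
        config,
        qkv_dtype=dtype,
        qkv_layout=qkv_layout,
        window_size=config.window_size,
        pad_between_seqs=pad_between_seqs,
    )
    flash_ok, fused_ok, unfused_ok = available_backends
    return flash_ok, fused_ok, unfused_ok
```

Note that sliding-window support is now folded into the query via `window_size`, which is why the standalone `swa` flag could be dropped.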
2 changes: 1 addition & 1 deletion build_tools/VERSION.txt

@@ -1 +1 @@
-1.9.0
+1.10.0
9 changes: 7 additions & 2 deletions build_tools/build_ext.py

@@ -10,6 +10,7 @@
 import sys
 import sysconfig
 import copy
+import time

 from pathlib import Path
 from subprocess import CalledProcessError
@@ -69,8 +70,8 @@ def _build_cmake(self, build_dir: Path, install_dir: Path) -> None:
     configure_command.append(f"-Dpybind11_DIR={pybind11_dir}")

     # CMake build and install commands
-    build_command = [_cmake_bin, "--build", build_dir]
-    install_command = [_cmake_bin, "--install", build_dir]
+    build_command = [_cmake_bin, "--build", build_dir, "--verbose"]
+    install_command = [_cmake_bin, "--install", build_dir, "--verbose"]

     # Check whether parallel build is restricted
     max_jobs = get_max_jobs_for_parallel_build()
@@ -81,13 +82,17 @@ def _build_cmake(self, build_dir: Path, install_dir: Path) -> None:
         build_command.append(str(max_jobs))

     # Run CMake commands
+    start_time = time.perf_counter()
     for command in [configure_command, build_command, install_command]:
         print(f"Running command {' '.join(command)}")
         try:
             subprocess.run(command, cwd=build_dir, check=True)
         except (CalledProcessError, OSError) as e:
             raise RuntimeError(f"Error when running CMake: {e}")
+
+    total_time = time.perf_counter() - start_time
+    print(f"Time for build_ext: {total_time:.2f} seconds")


 def get_build_ext(extension_cls: Type[setuptools.Extension]):
     class _CMakeBuildExtension(extension_cls):
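The new code wraps the whole configure/build/install sequence in a single `time.perf_counter()` measurement. A standalone sketch of the pattern, with a stand-in command so it runs anywhere CMake is installed:

```python
# Timing pattern added to _build_cmake, reduced to a runnable stand-in.
import subprocess
import time

start_time = time.perf_counter()
for command in [["cmake", "--version"]]:  # real code: configure, build, install
    print(f"Running command {' '.join(command)}")
    subprocess.run(command, check=True)
print(f"Time for build_ext: {time.perf_counter() - start_time:.2f} seconds")
```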
24 changes: 23 additions & 1 deletion build_tools/jax.py

@@ -2,7 +2,8 @@
 #
 # See LICENSE for license information.

-"""Paddle-paddle related extensions."""
+"""JAX related extensions."""
+import os
 from pathlib import Path

 import setuptools
@@ -12,6 +13,25 @@
 from typing import List


+def xla_path() -> str:
+    """XLA root path lookup.
+    Throws FileNotFoundError if XLA source is not found."""
+
+    try:
+        from jax.extend import ffi
+    except ImportError:
+        if os.getenv("XLA_HOME"):
+            xla_home = Path(os.getenv("XLA_HOME"))
+        else:
+            xla_home = "/opt/xla"
+    else:
+        xla_home = ffi.include_dir()
+
+    if not os.path.isdir(xla_home):
+        raise FileNotFoundError("Could not find xla source.")
+    return xla_home
+
+
 def setup_jax_extension(
     csrc_source_files,
     csrc_header_files,
@@ -27,12 +47,14 @@ def setup_jax_extension(

     # Header files
     cuda_home, _ = cuda_path()
+    xla_home = xla_path()
     include_dirs = [
         cuda_home / "include",
         common_header_files,
         common_header_files / "common",
         common_header_files / "common" / "include",
         csrc_header_files,
+        xla_home,
     ]

     # Compile flags
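Per the new helper, the XLA include root resolves in this order: `jax.extend.ffi.include_dir()` if a sufficiently new JAX is installed, else `$XLA_HOME`, else `/opt/xla`. A hedged usage sketch; the `build_tools.jax` import path is assumed from the file layout:

```python
import os
from build_tools.jax import xla_path  # import path assumed from the diff above

os.environ.setdefault("XLA_HOME", "/opt/xla")  # consulted only if jax.extend.ffi is absent
try:
    print("XLA headers found at:", xla_path())
except FileNotFoundError:
    print("No XLA source; install a recent jax or point XLA_HOME at a checkout")
```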
19 changes: 13 additions & 6 deletions build_tools/paddle.py

@@ -6,6 +6,7 @@
 from pathlib import Path

 import setuptools
+import os

 from .utils import cuda_version

@@ -61,12 +62,18 @@ def setup_paddle_extension(
     except FileNotFoundError:
         print("Could not determine CUDA Toolkit version")
     else:
-        if version >= (11, 2):
-            nvcc_flags.extend(["--threads", "4"])
-        if version >= (11, 0):
-            nvcc_flags.extend(["-gencode", "arch=compute_80,code=sm_80"])
-        if version >= (11, 8):
-            nvcc_flags.extend(["-gencode", "arch=compute_90,code=sm_90"])
+        if version < (12, 0):
+            raise RuntimeError("Transformer Engine requires CUDA 12.0 or newer")
+        nvcc_flags.extend(
+            (
+                "--threads",
+                os.getenv("NVTE_BUILD_THREADS_PER_JOB", "1"),
+                "-gencode",
+                "arch=compute_80,code=sm_80",
+                "-gencode",
+                "arch=compute_90,code=sm_90",
+            )
+        )

     # Construct Paddle CUDA extension
     sources = [str(path) for path in sources]
18 changes: 12 additions & 6 deletions build_tools/pytorch.py

@@ -67,12 +67,18 @@ def setup_pytorch_extension(
     except FileNotFoundError:
         print("Could not determine CUDA Toolkit version")
     else:
-        if version >= (11, 2):
-            nvcc_flags.extend(["--threads", "4"])
-        if version >= (11, 0):
-            nvcc_flags.extend(["-gencode", "arch=compute_80,code=sm_80"])
-        if version >= (11, 8):
-            nvcc_flags.extend(["-gencode", "arch=compute_90,code=sm_90"])
+        if version < (12, 0):
+            raise RuntimeError("Transformer Engine requires CUDA 12.0 or newer")
+        nvcc_flags.extend(
+            (
+                "--threads",
+                os.getenv("NVTE_BUILD_THREADS_PER_JOB", "1"),
+                "-gencode",
+                "arch=compute_80,code=sm_80",
+                "-gencode",
+                "arch=compute_90,code=sm_90",
+            )
+        )

     # Libraries
     library_dirs = []
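Both the PyTorch and Paddle builds now share the same flag logic: fail below CUDA 12.0, target sm_80 and sm_90 unconditionally, and read nvcc's `--threads` count from `NVTE_BUILD_THREADS_PER_JOB` (defaulting to 1 instead of the old hard-coded 4). A hedged re-implementation; the function name is illustrative, not from the repo:

```python
import os

def nvte_nvcc_flags(version: tuple) -> list:
    """Mirror of the gencode/threads logic from build_tools/{pytorch,paddle}.py."""
    if version < (12, 0):
        raise RuntimeError("Transformer Engine requires CUDA 12.0 or newer")
    return [
        "--threads", os.getenv("NVTE_BUILD_THREADS_PER_JOB", "1"),
        "-gencode", "arch=compute_80,code=sm_80",  # Ampere
        "-gencode", "arch=compute_90,code=sm_90",  # Hopper
    ]

print(nvte_nvcc_flags((12, 4)))  # ['--threads', '1', '-gencode', ...]
```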