Integrate HSTU into OSS CI (#4236)

q10 · facebook-github-bot · commit 2f2a1ef55557 · 2025-06-11T18:02:01.000-07:00
Summary: X-link: facebookresearch/FBGEMM#1396 - Integrate HSTU build into OSS CI - Earlier draft of the work: #4251 Pull Request resolved: #4236 Reviewed By: ionuthristodorescu Differential Revision: D76445631 Pulled By: q10 fbshipit-source-id: 4b1eafc557b2db4480080182c2759689e0bead2f
diff --git a/.github/scripts/fbgemm_gpu_build.bash b/.github/scripts/fbgemm_gpu_build.bash
@@ -260,16 +260,27 @@ __configure_fbgemm_gpu_build_cuda () {
     #   https://github.com/NVIDIA/nvbench/discussions/129
     #   https://github.com/vllm-project/vllm/blob/main/CMakeLists.txt#L187
     #   https://github.com/NVIDIA/cutlass/blob/main/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp#L224
+
+    # NOTE: The order of the arch_list matters: the lowest arch (7.0/7.5) is
+    # placed first, because appending it to the back of the list mysteriously
+    # results in undefined symbol errors when the .so is loaded
+    if [[ $fbgemm_build_target == "hstu" ]]; then
+      # HSTU requires sm_75 or higher
+      local arch_list="7.5"
+    else
+      local arch_list="7.0"
+    fi
+
     if    [[ $cuda_version_nvcc == *"V12.8"* ]]; then
-      local arch_list="7.0;8.0;9.0a;10.0a;12.0a"
+      local arch_list="${arch_list};8.0;9.0a;10.0a;12.0a"
 
     elif  [[ $cuda_version_nvcc == *"V12.6"* ]] ||
           [[ $cuda_version_nvcc == *"V12.4"* ]] ||
           [[ $cuda_version_nvcc == *"V12.1"* ]]; then
-      local arch_list="7.0;8.0;9.0a"
+      local arch_list="${arch_list};8.0;9.0a"
 
     else
-      local arch_list="7.0;8.0;9.0"
+      local arch_list="${arch_list};8.0;9.0"
     fi
   fi
   echo "[BUILD] Setting the following CUDA targets: ${arch_list}"
@@ -474,31 +485,29 @@ __build_fbgemm_gpu_common_pre_steps () {
   # Private function that uses variables instantiated by its caller
 
   # Check C/C++ compilers are visible (the build scripts look specifically for `gcc`)
-  (test_binpath "${env_name}" cc) || return 1
-  (test_binpath "${env_name}" gcc) || return 1
-  (test_binpath "${env_name}" c++) || return 1
-  (test_binpath "${env_name}" g++) || return 1
+  (test_binpath "${env_name}" cc)   || return 1
+  (test_binpath "${env_name}" gcc)  || return 1
+  (test_binpath "${env_name}" c++)  || return 1
+  (test_binpath "${env_name}" g++)  || return 1
 
-  # Set the default the FBGEMM build variant to be default (i.e. FBGEMM_GPU)
+  # Validate the FBGEMM build target; bail out if it is not recognized
-  if  [ "$fbgemm_build_target" != "genai" ] &&
-      [ "$fbgemm_build_target" != "default" ]; then
+  # shellcheck disable=SC2076
+  if [[ ! " genai hstu default " =~ " $fbgemm_build_target " ]]; then
     echo "################################################################################"
     echo "[BUILD] Unknown FBGEMM build TARGET: ${fbgemm_build_target}"
-    echo "[BUILD] Defaulting to 'default'"
+    echo "[BUILD] Exiting ..."
     echo "################################################################################"
-    export fbgemm_build_target="default"
+    return 1
   fi
 
-  # Set the default the FBGEMM build variant to be CUDA
+  # Validate the FBGEMM build variant; bail out if it is not recognized
-  if  [ "$fbgemm_build_variant" != "docs" ] &&
-      [ "$fbgemm_build_variant" != "cpu" ] &&
-      [ "$fbgemm_build_variant" != "cuda" ] &&
-      [ "$fbgemm_build_variant" != "rocm" ]; then
+  # shellcheck disable=SC2076
+  if [[ ! " docs cpu cuda rocm " =~ " $fbgemm_build_variant " ]]; then
     echo "################################################################################"
     echo "[BUILD] Unknown FBGEMM build VARIANT: ${fbgemm_build_variant}"
-    echo "[BUILD] Defaulting to CUDA"
+    echo "[BUILD] Exiting ..."
     echo "################################################################################"
-    export fbgemm_build_variant="cuda"
+    return 1
   fi
 
   # Extract and set the Python tag
@@ -603,6 +612,11 @@ __verify_library_symbols () {
       )
     fi
 
+  elif [ "${fbgemm_build_target}" == "hstu" ]; then
+    local lib_symbols_to_check=(
+      fbgemm_gpu::hstu::set_params_fprop
+    )
+
   else
     local lib_symbols_to_check=(
       fbgemm_gpu::asynchronous_inclusive_cumsum_cpu
diff --git a/.github/scripts/fbgemm_gpu_install.bash b/.github/scripts/fbgemm_gpu_install.bash
@@ -77,7 +77,7 @@ __install_check_subpackages () {
     "fbgemm_gpu.tbe.cache"
   )
 
-  if [ "$installed_fbgemm_target" != "genai" ]; then
+  if [ "$installed_fbgemm_target" == "default" ]; then
     subpackages+=(
       "fbgemm_gpu.split_embedding_codegen_lookup_invokers"
       "fbgemm_gpu.tbe.ssd"
@@ -91,10 +91,13 @@ __install_check_subpackages () {
 }
 
 __install_check_operator_registrations () {
+  # shellcheck disable=SC2155
   local env_prefix=$(env_name_or_prefix "${env_name}")
 
   local test_operators=()
+  local base_import="fbgemm_gpu"
   echo "[INSTALL] Check for operator registrations ..."
+
   if [ "$installed_fbgemm_target" == "genai" ]; then
     # NOTE: Currently, ROCm builds of GenAI only include quantization
     # operators.
@@ -115,7 +118,13 @@ __install_check_operator_registrations () {
       fi
     fi
 
-  else
+  elif [ "$installed_fbgemm_target" == "hstu" ]; then
+    test_operators+=(
+      "torch.ops.fbgemm.hstu_varlen_bwd_80"
+    )
+    base_import="fbgemm_gpu.experimental.hstu"
+
+  elif [ "$installed_fbgemm_target" == "default" ]; then
     test_operators+=(
       "torch.ops.fbgemm.asynchronous_inclusive_cumsum"
       "torch.ops.fbgemm.split_embedding_codegen_lookup_sgd_function_pt2"
@@ -124,7 +133,7 @@ __install_check_operator_registrations () {
 
   for operator in "${test_operators[@]}"; do
     # shellcheck disable=SC2086
-    if conda run ${env_prefix} python -c "import torch; import fbgemm_gpu; print($operator)"; then
+    if conda run ${env_prefix} python -c "import torch; import ${base_import}; print($operator)"; then
       echo "[CHECK] FBGEMM_GPU operator appears to be correctly registered: $operator"
     else
       echo "################################################################################"
diff --git a/.github/scripts/fbgemm_gpu_test.bash b/.github/scripts/fbgemm_gpu_test.bash
@@ -184,9 +184,8 @@ __setup_fbgemm_gpu_test () {
     print_exec conda env config vars set ${env_prefix} KMP_DUPLICATE_LIB_OK=1
   fi
 
-  # NOTE: Uncomment to enable PyTorch C++ stacktraces
   # shellcheck disable=SC2086
-  # print_exec conda env config vars set ${env_prefix} TORCH_SHOW_CPP_STACKTRACES=1
+  print_exec conda env config vars set ${env_prefix} TORCH_SHOW_CPP_STACKTRACES=1
 
   echo "[TEST] Installing PyTest ..."
   # shellcheck disable=SC2086
@@ -267,6 +266,11 @@ __determine_test_directories () {
       )
     fi
 
+  elif [ "$fbgemm_build_target" == "hstu" ]; then
+    target_directories+=(
+      fbgemm_gpu/experimental/hstu/test
+    )
+
   else
     target_directories+=(
       fbgemm_gpu/test
diff --git a/.github/workflows/fbgemm_gpu_ci_cuda.yml b/.github/workflows/fbgemm_gpu_ci_cuda.yml
@@ -3,9 +3,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-# This workflow is used for FBGEMM GPU/GenAI CUDA CI as well as nightly builds
-# of FBGEMM GPU/GenAI CUDA against PyTorch-CUDA Nightly.
-name: FBGEMM GPU/GenAI CUDA CI
+# This workflow is used for FBGEMM GPU/GenAI/HSTU CUDA CI as well as nightly
+# builds of FBGEMM GPU/GenAI/HSTU CUDA against PyTorch-CUDA Nightly.
+name: FBGEMM GPU/GenAI/HSTU CUDA CI
 
 on:
   # PR Trigger (enabled for regression checks and debugging)
@@ -74,9 +74,13 @@ jobs:
           { arch: x86, instance: "linux.24xlarge", build-target: "default", cuda-version: "11.8.0" },
           { arch: x86, instance: "linux.24xlarge", build-target: "default", cuda-version: "12.6.3" },
           { arch: x86, instance: "linux.24xlarge", build-target: "default", cuda-version: "12.8.0" },
+
           # GenAI is unable to support 11.8.0 anymore as of https://github.com/pytorch/FBGEMM/pull/4138
           { arch: x86, instance: "linux.8xlarge.memory", build-target: "genai", cuda-version: "12.6.3" },
           { arch: x86, instance: "linux.8xlarge.memory", build-target: "genai", cuda-version: "12.8.0" },
+
+          { arch: x86, instance: "linux.12xlarge.memory", build-target: "hstu", cuda-version: "12.6.3" },
+          { arch: x86, instance: "linux.12xlarge.memory", build-target: "hstu", cuda-version: "12.8.0" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
         compiler: [ "gcc", "clang" ]
@@ -167,6 +171,8 @@ jobs:
           { build-target: "default", cuda-version: "12.8.0" },
           { build-target: "genai", cuda-version: "12.6.3" },
           { build-target: "genai", cuda-version: "12.8.0" },
+          { build-target: "hstu", cuda-version: "12.6.3" },
+          { build-target: "hstu", cuda-version: "12.8.0" },
         ]
         python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
         # Specify exactly ONE CUDA version for artifact publish
diff --git a/fbgemm_gpu/CMakeLists.txt b/fbgemm_gpu/CMakeLists.txt
@@ -283,8 +283,7 @@ if(FBGEMM_BUILD_TARGET STREQUAL BUILD_TARGET_GENAI)
   add_subdirectory(experimental/gemm)
 
 elseif(FBGEMM_BUILD_TARGET STREQUAL BUILD_TARGET_HSTU)
-  if(FBGEMM_BUILD_VARIANT STREQUAL BUILD_VARIANT_CPU OR
-    FBGEMM_BUILD_VARIANT STREQUAL BUILD_VARIANT_ROCM)
+  if(NOT FBGEMM_BUILD_VARIANT STREQUAL BUILD_VARIANT_CUDA)
     message(FATAL_ERROR
       "Unsupported (target, variant) combination:
       (${FBGEMM_BUILD_TARGET}, ${FBGEMM_BUILD_VARIANT})")
diff --git a/fbgemm_gpu/experimental/hstu/README.md b/fbgemm_gpu/experimental/hstu/README.md
@@ -1,6 +1,6 @@
 # FBGEMM HSTU
 
-FBGEMM HSTU(Hierarchical Sequential Transduction Units)
+FBGEMM HSTU (Hierarchical Sequential Transduction Units)
 
 # **1. Overview**
 
diff --git a/fbgemm_gpu/experimental/hstu/test/hstu_test.py b/fbgemm_gpu/experimental/hstu/test/hstu_test.py
@@ -9,6 +9,7 @@
 
 import logging
 import math
+import os
 import unittest
 from typing import Optional, Tuple
 
@@ -19,6 +20,8 @@
 
 from hypothesis import given, settings, strategies as st, Verbosity
 
+running_on_github: bool = os.getenv("GITHUB_ENV") is not None
+
 logger: logging.Logger = logging.getLogger()
 logger.setLevel(logging.INFO)
 
@@ -453,6 +456,9 @@ def _hstu_attention_maybe_from_cache(
 class HSTU16Test(unittest.TestCase):
     """Test HSTU attention with float16 inputs."""
 
+    @unittest.skipIf(
+        running_on_github, "GitHub runners are unable to run the test at this time"
+    )
     @given(
         batch_size=st.sampled_from([32]),
         heads=st.sampled_from([2]),
diff --git a/fbgemm_gpu/fbgemm_gpu/__init__.py b/fbgemm_gpu/fbgemm_gpu/__init__.py
@@ -17,7 +17,7 @@ def _load_library(filename: str, no_throw: bool = False) -> None:
         torch.ops.load_library(os.path.join(os.path.dirname(__file__), filename))
         logging.info(f"Successfully loaded: '{filename}'")
     except Exception as error:
-        logging.error(f"Could not load the library '{filename}': {error}")
+        logging.error(f"Could not load the library '{filename}'!\n\n\n{error}\n\n\n")
         if not no_throw:
             raise error
 
diff --git a/fbgemm_gpu/requirements.txt b/fbgemm_gpu/requirements.txt
@@ -14,6 +14,7 @@ backports.tarfile
 build
 cmake
 click
+einops
 hypothesis
 jinja2
 mpmath==1.3.0
diff --git a/fbgemm_gpu/requirements_genai.txt b/fbgemm_gpu/requirements_genai.txt
@@ -16,6 +16,7 @@ backports.tarfile
 build
 cmake
 click
+einops
 hypothesis
 jinja2
 mpmath==1.3.0