Commit 14520d1

jiawenliu64 authored and facebook-github-bot committed
Enable FP4 CUTLASS GEMM and CUDA quantization kernels (#4004)
Summary:
X-link: facebookresearch/FBGEMM#1091

Pull Request resolved: #4004

Enable MXFP4 and NVFP4 CUTLASS GEMM and NVFP4 CUDA quantization kernels

Reviewed By: jianyuh

Differential Revision: D69505435

fbshipit-source-id: 7b438628663efec47851bb3908ae2f74ee9e8261
1 parent 517b73d · commit 14520d1

33 files changed: +1928 -3 lines

.github/scripts/fbgemm_gpu_build.bash

Lines changed: 1 addition & 1 deletion
@@ -251,7 +251,7 @@ __configure_fbgemm_gpu_build_cuda () {
     # https://github.com/vllm-project/vllm/blob/main/CMakeLists.txt#L187
     # https://github.com/NVIDIA/cutlass/blob/main/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp#L224
     if [[ $cuda_version_nvcc == *"V12.8"* ]]; then
-      local arch_list="7.0;8.0;9.0;9.0a;10.0;10.0a;12.0;12.0a"
+      local arch_list="7.0;8.0;9.0;9.0a;10.0a;12.0a"

     elif [[ $cuda_version_nvcc == *"V12.6"* ]] ||
          [[ $cuda_version_nvcc == *"V12.4"* ]] ||

fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py

Lines changed: 44 additions & 1 deletion
@@ -25,7 +25,10 @@
     grouped_gemm,
     grouped_gemm_fp8_rowwise,
 )
-from fbgemm_gpu.experimental.gen_ai.quantize import quantize_int4_preshuffle
+from fbgemm_gpu.experimental.gen_ai.quantize import (
+    quantize_int4_preshuffle,
+    scaled_fp4_quant,
+)
 
 try:
     from tinygemm.utils import group_quantize_tensor
@@ -1962,3 +1965,43 @@ def hip(self) -> bool:
     def cuda(self) -> bool:
         # This op is not always supported.
         return MACHETE_ENABLED
+
+
+@register_quantize_op
+class FP4Gemm(QuantizeOpBase):
+    """
+    FP4 matmul with block-wise scaling.
+    """
+
+    def quantize(self, x, w):
+        x_global_scale = ((448.0 * 6.0) / torch.amax(x.flatten(), dim=-1)).to(
+            torch.float32
+        )
+        w_global_scale = ((448.0 * 6.0) / torch.amax(w.flatten(), dim=-1)).to(
+            torch.float32
+        )
+        global_scale = 1 / (x_global_scale * w_global_scale)
+
+        xq, x_scale = scaled_fp4_quant(x, x_global_scale)
+        wq, w_scale = scaled_fp4_quant(w, w_global_scale)
+        return xq, wq, x_scale, w_scale, global_scale
+
+    def compute(self, xq, wq, x_scale, w_scale, global_scale):
+        return torch.ops.fbgemm.f4f4bf16(xq, wq, x_scale, w_scale, global_scale)
+
+    def quantize_and_compute(self, x, w):
+        xq, wq, x_scale, w_scale, global_scale = self.quantize(x, w)
+        return self.compute(xq, wq, x_scale, w_scale, global_scale)
+
+    @property
+    def name(self) -> str:
+        return "cutlass_f4f4bf16"
+
+    @property
+    def hip(self) -> bool:
+        # F4F4BF16 is only supported on CUDA.
+        return False
+
+    @property
+    def cuda(self) -> bool:
+        return True
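
For orientation, here is a minimal sketch of exercising the new benchmark op end to end. The shapes are illustrative assumptions, and it presumes an FBGEMM build that includes the FP4 kernels (CUDA >= 12.8) on a supported GPU:

    import torch

    # FP4Gemm is the benchmark op defined in the diff above.
    op = FP4Gemm()
    x = torch.randn(128, 4096, dtype=torch.bfloat16, device="cuda")
    w = torch.randn(4096, 4096, dtype=torch.bfloat16, device="cuda")

    # quantize() derives per-tensor global scales (448.0 * 6.0 is presumably the
    # float8_e4m3 max times the FP4 e2m1 max), quantizes both operands with
    # scaled_fp4_quant, and compute() runs the CUTLASS f4f4bf16 GEMM.
    xq, wq, x_scale, w_scale, global_scale = op.quantize(x, w)
    out = op.compute(xq, wq, x_scale, w_scale, global_scale)  # bf16, shape (128, 4096)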

fbgemm_gpu/experimental/gen_ai/gen_ai/quantize.py

Lines changed: 54 additions & 0 deletions
@@ -162,3 +162,57 @@ def _quantize(
     wq, scales = _quantize(w, dtype=dtype)
 
     return wq, scales
+
+
+def scaled_fp4_quant(
+    input: torch.Tensor, input_global_scale: torch.Tensor
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Quantize input tensor to FP4 and return quantized tensor and scale.
+    This function quantizes the last dimension of the given tensor `input`. For
+    every 16 consecutive elements, a single dynamically computed scaling factor
+    is shared. This scaling factor is quantized using the `input_global_scale`
+    and is stored in a swizzled layout (see
+    https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-scale-factor-b-layout-4x).
+    Args:
+        input: The input tensor to be quantized to FP4.
+        input_global_scale: A scalar scaling factor for the entire tensor.
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: The output tensor in FP4 (every two
+        values packed into a uint8) and the float8_e4m3 scaling factors
+        in the swizzled layout.
+    """
+    assert input.ndim >= 1, f"input.ndim needs to be >= 1, but got {input.ndim}."
+    other_dims = 1 if input.ndim == 1 else -1
+    input = input.reshape(other_dims, input.shape[-1])
+    m, n = input.shape
+    block_size = 16
+    device = input.device
+
+    assert n % block_size == 0, f"last dim has to be multiple of 16, but got {n}."
+    assert input.dtype in (
+        torch.float16,
+        torch.bfloat16,
+    ), f"input.dtype needs to be fp16 or bf16 but got {input.dtype}."
+
+    # Two fp4 values will be packed into an uint8.
+    output = torch.empty((m, n // 2), device=device, dtype=torch.uint8)
+
+    # We use the rounded values to store the swizzled values. Due to the
+    # requirement of the Tensor Core, the minimum tile is 128x4 for the scales.
+    # So, we first pad the scales to multiples of 128 and 4. Then, the scales
+    # (in float8_e4m3fn) are packed into an int32 for every 4 values. More:
+    # https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-mma-scale-factor-b-layout-4x
+    def round_up(x: int, y: int) -> int:
+        return (x + y - 1) // y * y
+
+    rounded_m = round_up(m, 128)
+    scale_n = n // block_size
+    rounded_n = round_up(scale_n, 4)
+    output_scale = torch.empty(
+        (rounded_m, rounded_n // 4), device=device, dtype=torch.int32
+    )
+
+    torch.ops.fbgemm.scaled_fp4_quant(output, input, output_scale, input_global_scale)
+    output_scale = output_scale.view(torch.float8_e4m3fn)
+    return output, output_scale
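
As a quick sanity check on the layout described in the docstring, a hedged sketch of calling scaled_fp4_quant directly (shapes are assumptions; requires the scaled_fp4_quant CUDA kernel added in this commit):

    import torch
    from fbgemm_gpu.experimental.gen_ai.quantize import scaled_fp4_quant

    x = torch.randn(256, 4096, dtype=torch.bfloat16, device="cuda")
    # Per-tensor global scale, mirroring the FP4Gemm benchmark op above.
    x_global_scale = ((448.0 * 6.0) / torch.amax(x.flatten(), dim=-1)).to(torch.float32)

    xq, x_scale = scaled_fp4_quant(x, x_global_scale)
    # Two FP4 values per byte: (256, 4096) -> (256, 2048), dtype uint8.
    print(xq.shape, xq.dtype)
    # Block scales: one per 16 elements, padded to 128x4 tiles and viewed as
    # float8_e4m3fn -> (256, 256).
    print(x_scale.shape, x_scale.dtype)
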
Lines changed: 197 additions & 0 deletions
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cutlass/util/device_memory.h>
+// clang-format on
+
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12080)
+#include "f4f4bf16/f4f4bf16_manifest.cuh"
+#endif
+
+namespace fbgemm_gpu {
+
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12080)
+
+at::Tensor dispatch_f4f4bf16_kernel(
+    at::Tensor XQ, // FP4
+    at::Tensor WQ, // FP4
+    at::Tensor x_scale,
+    at::Tensor w_scale,
+    at::Tensor global_scale,
+    bool use_mx = false) {
+  auto M = XQ.size(0);
+  auto K = XQ.size(1);
+  auto N = WQ.size(0);
+  auto BLOCK_SIZE = 16;
+  TORCH_CHECK(
+      N % BLOCK_SIZE == 0 && K % BLOCK_SIZE == 0,
+      "Weight dimensions N and K must be multiples of block size 16");
+
+  // MXFP4
+  if (use_mx) {
+    if (M <= 128) {
+      if (N <= 1024) {
+        return f4f4bf16_256_128_2_4_1_t(XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N <= 2048) {
+        return f4f4bf16_256_192_4_1_1_t(XQ, WQ, x_scale, w_scale, global_scale);
+      } else {
+        return f4f4bf16_128_128_4_1_1_t(XQ, WQ, x_scale, w_scale, global_scale);
+      }
+    } else if (M <= 2048) {
+      if (N <= 2048) {
+        return f4f4bf16_256_128_2_2_1_t(XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N <= 8192) {
+        return f4f4bf16_128_256_2_1_1_t(XQ, WQ, x_scale, w_scale, global_scale);
+      } else {
+        return f4f4bf16_256_256_2_1_1_t(XQ, WQ, x_scale, w_scale, global_scale);
+      }
+    } else if (M <= 4096) {
+      if (N <= 4096) {
+        return f4f4bf16_256_256_4_1_1_t(XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N <= 8192) {
+        return f4f4bf16_256_256_2_1_1_t(XQ, WQ, x_scale, w_scale, global_scale);
+      } else {
+        return f4f4bf16_256_128_2_4_1_t(XQ, WQ, x_scale, w_scale, global_scale);
+      }
+    } else if (M <= 8192) {
+      if (N <= 4096) {
+        return f4f4bf16_256_256_2_2_1_t(XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N <= 8192) {
+        return f4f4bf16_256_256_2_4_1_t(XQ, WQ, x_scale, w_scale, global_scale);
+      } else {
+        return f4f4bf16_128_256_2_1_1_t(XQ, WQ, x_scale, w_scale, global_scale);
+      }
+    } else if (M <= 16384) {
+      if (N <= 2048) {
+        return f4f4bf16_256_256_2_4_1_t(XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N <= 8192) {
+        return f4f4bf16_128_192_2_2_1_t(XQ, WQ, x_scale, w_scale, global_scale);
+      } else {
+        return f4f4bf16_128_256_2_1_1_t(XQ, WQ, x_scale, w_scale, global_scale);
+      }
+    } else if (M <= 32768) {
+      if (N <= 1024) {
+        return f4f4bf16_256_256_2_1_1_t(XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N <= 4096) {
+        return f4f4bf16_128_192_2_2_1_t(XQ, WQ, x_scale, w_scale, global_scale);
+      } else {
+        return f4f4bf16_256_192_4_1_1_t(XQ, WQ, x_scale, w_scale, global_scale);
+      }
+    } else if (M <= 65536) {
+      if (N <= 2048) {
+        return f4f4bf16_256_192_2_4_1_t(XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N <= 4096) {
+        return f4f4bf16_256_192_2_2_1_t(XQ, WQ, x_scale, w_scale, global_scale);
+      } else {
+        return f4f4bf16_256_256_2_1_1_t(XQ, WQ, x_scale, w_scale, global_scale);
+      }
+    } else {
+      if (N <= 1024) {
+        return f4f4bf16_256_192_2_4_1_t(XQ, WQ, x_scale, w_scale, global_scale);
+      } else {
+        return f4f4bf16_256_256_2_2_1_t(XQ, WQ, x_scale, w_scale, global_scale);
+      }
+    }
+  }
+  // NVFP4
+  else {
+    if (M <= 128) {
+      if (N <= 1024) {
+        return f4f4bf16_256_128_2_4_1_f(XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N <= 2048) {
+        return f4f4bf16_256_192_4_1_1_f(XQ, WQ, x_scale, w_scale, global_scale);
+      } else {
+        return f4f4bf16_128_128_4_1_1_f(XQ, WQ, x_scale, w_scale, global_scale);
+      }
+    } else if (M <= 2048) {
+      if (N <= 2048) {
+        return f4f4bf16_256_128_2_2_1_f(XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N <= 8192) {
+        return f4f4bf16_128_256_2_1_1_f(XQ, WQ, x_scale, w_scale, global_scale);
+      } else {
+        return f4f4bf16_256_256_2_1_1_f(XQ, WQ, x_scale, w_scale, global_scale);
+      }
+    } else if (M <= 4096) {
+      if (N <= 4096) {
+        return f4f4bf16_256_256_4_1_1_f(XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N <= 8192) {
+        return f4f4bf16_256_256_2_1_1_f(XQ, WQ, x_scale, w_scale, global_scale);
+      } else {
+        return f4f4bf16_256_128_2_4_1_f(XQ, WQ, x_scale, w_scale, global_scale);
+      }
+    } else if (M <= 8192) {
+      if (N <= 4096) {
+        return f4f4bf16_256_256_2_2_1_f(XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N <= 8192) {
+        return f4f4bf16_256_256_2_4_1_f(XQ, WQ, x_scale, w_scale, global_scale);
+      } else {
+        return f4f4bf16_128_256_2_1_1_f(XQ, WQ, x_scale, w_scale, global_scale);
+      }
+    } else if (M <= 16384) {
+      if (N <= 2048) {
+        return f4f4bf16_256_256_2_4_1_f(XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N <= 8192) {
+        return f4f4bf16_128_192_2_2_1_f(XQ, WQ, x_scale, w_scale, global_scale);
+      } else {
+        return f4f4bf16_128_256_2_1_1_f(XQ, WQ, x_scale, w_scale, global_scale);
+      }
+    } else if (M <= 32768) {
+      if (N <= 1024) {
+        return f4f4bf16_256_256_2_1_1_f(XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N <= 4096) {
+        return f4f4bf16_128_192_2_2_1_f(XQ, WQ, x_scale, w_scale, global_scale);
+      } else {
+        return f4f4bf16_256_192_4_1_1_f(XQ, WQ, x_scale, w_scale, global_scale);
+      }
+    } else if (M <= 65536) {
+      if (N <= 2048) {
+        return f4f4bf16_256_192_2_4_1_f(XQ, WQ, x_scale, w_scale, global_scale);
+      } else if (N <= 4096) {
+        return f4f4bf16_256_192_2_2_1_f(XQ, WQ, x_scale, w_scale, global_scale);
+      } else {
+        return f4f4bf16_256_256_2_1_1_f(XQ, WQ, x_scale, w_scale, global_scale);
+      }
+    } else {
+      if (N <= 1024) {
+        return f4f4bf16_256_192_2_4_1_f(XQ, WQ, x_scale, w_scale, global_scale);
+      } else {
+        return f4f4bf16_256_256_2_2_1_f(XQ, WQ, x_scale, w_scale, global_scale);
+      }
+    }
+  }
+}
+
+at::Tensor f4f4bf16(
+    at::Tensor XQ, // FP4
+    at::Tensor WQ, // FP4
+    at::Tensor x_scale,
+    at::Tensor w_scale,
+    at::Tensor global_scale,
+    bool use_mx = false) {
+  return dispatch_f4f4bf16_kernel(
+      XQ, WQ, x_scale, w_scale, global_scale, use_mx);
+}
+
+#else
+
+at::Tensor f4f4bf16(
+    at::Tensor XQ, // FP4
+    at::Tensor WQ, // FP4
+    at::Tensor x_scale,
+    at::Tensor w_scale,
+    at::Tensor global_scale,
+    bool use_mx = false) {
+  throw std::runtime_error(
+      "CUDA version is older than 12.8"); // requires CUDA>=12.8
+}
+
+#endif
+
+} // namespace fbgemm_gpu
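
Combined with the quantization helper, the new op can be driven from Python roughly as follows. This is a sketch under the assumption that the registered torch.ops.fbgemm.f4f4bf16 schema mirrors the C++ signature above; shapes are illustrative:

    import torch
    from fbgemm_gpu.experimental.gen_ai.quantize import scaled_fp4_quant

    x = torch.randn(2048, 4096, dtype=torch.bfloat16, device="cuda")   # (M, K)
    w = torch.randn(8192, 4096, dtype=torch.bfloat16, device="cuda")   # (N, K)
    x_gs = ((448.0 * 6.0) / torch.amax(x.flatten(), dim=-1)).to(torch.float32)
    w_gs = ((448.0 * 6.0) / torch.amax(w.flatten(), dim=-1)).to(torch.float32)
    global_scale = 1 / (x_gs * w_gs)

    xq, x_scale = scaled_fp4_quant(x, x_gs)
    wq, w_scale = scaled_fp4_quant(w, w_gs)

    # dispatch_f4f4bf16_kernel picks a CUTLASS tile configuration from (M, N);
    # use_mx (default false) would switch to the MXFP4 (_t) instantiations, but
    # scaled_fp4_quant produces NVFP4 data, so the NVFP4 (_f) path applies here.
    out = torch.ops.fbgemm.f4f4bf16(xq, wq, x_scale, w_scale, global_scale)
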
Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "f4f4bf16_common.cuh"
+
+namespace fbgemm_gpu {
+
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12080)
+
+at::Tensor f4f4bf16_128_128_4_1_1_f(
+    at::Tensor XQ, // FP4
+    at::Tensor WQ, // FP4
+    at::Tensor x_scale,
+    at::Tensor w_scale,
+    at::Tensor global_scale) {
+  // Dispatch this kernel to the correct underlying implementation.
+  return _f4f4bf16<128, 128, 4, 1, 1, false>(
+      XQ, WQ, x_scale, w_scale, global_scale);
+}
+
+#endif
+
+} // namespace fbgemm_gpu
Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "f4f4bf16_common.cuh"
+
+namespace fbgemm_gpu {
+
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12080)
+
+at::Tensor f4f4bf16_128_128_4_1_1_t(
+    at::Tensor XQ, // FP4
+    at::Tensor WQ, // FP4
+    at::Tensor x_scale,
+    at::Tensor w_scale,
+    at::Tensor global_scale) {
+  // Dispatch this kernel to the correct underlying implementation.
+  return _f4f4bf16<128, 128, 4, 1, 1, true>(
+      XQ, WQ, x_scale, w_scale, global_scale);
+}
+
+#endif
+
+} // namespace fbgemm_gpu
